import asyncio
import os
import shutil
import tempfile
import uuid
from pathlib import Path
from typing import Dict, Optional

from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
from fastapi.responses import HTMLResponse, FileResponse

# Store for conversion tasks, keyed by task ID.  Each value holds the keys
# "status", "filename", "result", "error" and "temp_dir".
# NOTE(review): this is a plain in-process dict; nothing in this section evicts
# finished tasks or removes the temp dirs of completed ones — confirm intended.
conversion_tasks: Dict[str, dict] = {}


@app.post("/api/convert")
async def convert_pdf(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...)
):
    """Accept a PDF upload and schedule its conversion to Markdown.

    The upload is copied into a fresh temporary directory, a task record is
    registered in ``conversion_tasks`` and the conversion itself is queued as
    a background task.

    Returns:
        A dict with the new ``task_id``, its initial ``status`` and the URL to
        poll for progress.

    Raises:
        HTTPException: 400 when the upload has no ``.pdf`` filename, 500 when
            the upload cannot be written to disk.
    """
    # The filename is supplied by the client: keep only its final path
    # component so a value such as "../../x.pdf" cannot escape temp_dir, and
    # tolerate a missing filename instead of failing with AttributeError.
    safe_name = Path(file.filename or "").name
    if not safe_name.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # Generate unique task ID
    task_id = str(uuid.uuid4())

    # Save uploaded file
    temp_dir = Path(tempfile.mkdtemp())
    pdf_path = temp_dir / safe_name

    try:
        with open(pdf_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
    except Exception as e:
        # Best-effort cleanup: a failing rmtree must not mask the real error.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise HTTPException(status_code=500, detail=f"Failed to save file: {str(e)}") from e

    # Initialize task status
    conversion_tasks[task_id] = {
        "status": "processing",
        "filename": safe_name,
        "result": None,
        "error": None,
        "temp_dir": str(temp_dir)
    }

    # Start conversion in background
    background_tasks.add_task(process_pdf_conversion, task_id, str(pdf_path))

    return {
        "task_id": task_id,
        "status": "processing",
        "message": "PDF conversion started",
        "check_status_url": f"/api/status/{task_id}"
    }


async def process_pdf_conversion(task_id: str, pdf_path: str):
    """Run the (currently simulated) conversion for one task.

    Writes a placeholder Markdown file next to the uploaded PDF and records
    the outcome on the task entry.  On failure the error text is stored and
    the task's temporary directory is removed, since nothing can be
    downloaded from it.
    """
    task = conversion_tasks[task_id]
    try:
        # For now, just simulate conversion
        await asyncio.sleep(2)  # Simulate processing

        # Create a dummy markdown file
        output_path = Path(pdf_path).with_suffix('.md')
        # Explicit UTF-8: the heading echoes the upload's name, which may not
        # be representable in the platform default encoding.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(f"# Converted from {Path(pdf_path).name}\n\n")
            f.write("This is a placeholder conversion. Full MinerU integration coming soon.\n")

        task["status"] = "completed"
        task["result"] = str(output_path)

    except Exception as e:
        task["status"] = "failed"
        task["error"] = str(e)
        shutil.rmtree(task["temp_dir"], ignore_errors=True)


@app.get("/api/status/{task_id}")
async def get_conversion_status(task_id: str):
    """Report the state of a conversion task.

    Returns the task ID, status and filename; a ``download_url`` is added for
    completed tasks and the ``error`` text for failed ones.

    Raises:
        HTTPException: 404 when the task ID is unknown.
    """
    if task_id not in conversion_tasks:
        raise HTTPException(status_code=404, detail="Task not found")

    task = conversion_tasks[task_id]
    response = {
        "task_id": task_id,
        "status": task["status"],
        "filename": task["filename"]
    }

    if task["status"] == "completed":
        response["download_url"] = f"/api/download/{task_id}"
    elif task["status"] == "failed":
        response["error"] = task["error"]

    return response


@app.get("/api/download/{task_id}")
async def download_converted_file(task_id: str):
    """Serve the Markdown produced for a completed task.

    Raises:
        HTTPException: 404 when the task or its output file is missing,
            400 when the task has not completed.
    """
    if task_id not in conversion_tasks:
        raise HTTPException(status_code=404, detail="Task not found")

    task = conversion_tasks[task_id]
    if task["status"] != "completed":
        raise HTTPException(status_code=400, detail="Conversion not completed")

    if not task["result"] or not Path(task["result"]).exists():
        raise HTTPException(status_code=404, detail="Converted file not found")

    return FileResponse(
        task["result"],
        media_type="text/markdown",
        filename=Path(task["result"]).name
    )
#!/usr/bin/env python3
"""
PDF to Markdown Converter using MinerU (vendor/mineru)
This is the main conversion script that uses the local MinerU installation
"""

import argparse
import logging
import os
import subprocess
import sys
import time
from pathlib import Path
from typing import Optional

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('pdf_converter.log')
    ]
)
logger = logging.getLogger(__name__)

# Sub-commands understood by the command line interface.
_COMMANDS = ('convert', 'batch')


class PdfConverterResult:
    """Outcome of converting a single PDF.

    ``str(result)`` yields a one-line, human-readable summary that already
    carries a success/failure marker.
    """

    def __init__(self, pdf_path: str, success: bool, md_path: Optional[str] = None,
                 time_taken: float = 0, error: Optional[str] = None):
        self.pdf_path = pdf_path      # path of the source PDF, as a string
        self.success = success        # True when a Markdown file was produced
        self.md_path = md_path        # path of the generated Markdown, if any
        self.time_taken = time_taken  # wall-clock seconds spent converting
        self.error = error            # failure description, if any

    def __str__(self):
        if self.success:
            return f"✅ Successfully converted {self.pdf_path} in {self.time_taken:.2f}s"
        else:
            return f"❌ Failed to convert {self.pdf_path}: {self.error}"


class MineruPdfConverter:
    """
    PDF to Markdown converter using MinerU

    Conversion is delegated to the ``mineru`` command line tool, which must be
    available on ``PATH``.
    """

    def __init__(self, output_dir: str = "output"):
        """Remember *output_dir* and create it if it does not exist yet."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _find_markdown(pdf_output_dir: str, stem: str) -> Optional[str]:
        """Return the Markdown file generated under *pdf_output_dir*, or None."""
        expected_md = Path(pdf_output_dir) / stem / "txt" / f"{stem}.md"
        if expected_md.exists():
            logger.info(f"✅ Markdown file created: {expected_md}")
            return str(expected_md)

        # Fall back to any Markdown file below the output directory; sorting
        # makes the choice deterministic when several exist.
        fallback = next(iter(sorted(Path(pdf_output_dir).rglob("*.md"))), None)
        if fallback is None:
            return None
        logger.info(f"✅ Found markdown file: {fallback}")
        return str(fallback)

    def convert_file(self, pdf_path: str, delete_after: bool = False) -> PdfConverterResult:
        """Convert a single PDF file to Markdown using MinerU.

        Args:
            pdf_path: Path of the PDF to convert.
            delete_after: Remove the source PDF once conversion succeeded.

        Returns:
            A ``PdfConverterResult``.  Failures (missing file, non-zero exit
            status of ``mineru``, no Markdown produced, unexpected errors) are
            reported through the result rather than raised.
        """
        start_time = time.time()

        try:
            pdf_path = Path(pdf_path)
            if not pdf_path.exists():
                return PdfConverterResult(
                    str(pdf_path), False, error=f"File not found: {pdf_path}"
                )

            logger.info(f"Processing: {pdf_path}")

            # Prepare output directory
            pdf_output_dir = os.path.join(self.output_dir, pdf_path.stem)

            # Run MinerU command
            cmd = [
                "mineru",
                "-p", str(pdf_path),
                "-o", pdf_output_dir,
                "-m", "txt",    # Use text mode
                "-f", "false",  # Disable formula parsing for speed
                "-t", "false",  # Disable table parsing for speed
            ]

            logger.info(f"Running command: {' '.join(cmd)}")

            # Execute MinerU; the exit status is inspected explicitly below.
            result = subprocess.run(cmd, capture_output=True, text=True, check=False)

            if result.returncode != 0:
                return PdfConverterResult(
                    str(pdf_path), False, error=result.stderr or "Unknown error"
                )

            # Find the generated markdown file
            md_path = self._find_markdown(pdf_output_dir, pdf_path.stem)
            if not md_path:
                return PdfConverterResult(
                    str(pdf_path), False, error="No markdown file generated"
                )

            # Delete original PDF if requested
            if delete_after and pdf_path.exists():
                pdf_path.unlink()
                logger.info(f"🗑️ Deleted original PDF: {pdf_path}")

            elapsed_time = time.time() - start_time

            return PdfConverterResult(
                str(pdf_path), True, md_path=md_path, time_taken=elapsed_time
            )

        except Exception as e:
            # Boundary handler: one bad file must not abort a whole batch.
            # logger.exception records the traceback through the handlers.
            logger.exception(f"Error processing {pdf_path}: {e}")

            return PdfConverterResult(
                str(pdf_path), False, error=str(e)
            )


class BatchProcessor:
    """Process multiple PDF files in batch

    ``workers`` is stored for interface compatibility but not used: files are
    converted one after another.
    """

    def __init__(self, batch_dir: str = "batch-files", output_dir: str = "output",
                 workers: int = 1, delete_after: bool = False):
        self.batch_dir = batch_dir
        self.output_dir = output_dir
        self.workers = workers
        self.delete_after = delete_after
        self.converter = MineruPdfConverter(output_dir)

    def find_pdf_files(self) -> list[Path]:
        """Find all PDF files in the batch directory.

        The search is recursive, matches the ``.pdf`` extension regardless of
        case and returns the paths sorted, so runs are reproducible.  A
        missing batch directory yields an empty list.
        """
        batch_path = Path(self.batch_dir)

        if not batch_path.exists():
            logger.warning(f"Batch directory not found: {self.batch_dir}")
            return []

        pdf_files = sorted(
            path for path in batch_path.rglob("*")
            if path.is_file() and path.suffix.lower() == ".pdf"
        )
        logger.info(f"Found {len(pdf_files)} PDF files in {self.batch_dir}")

        return pdf_files

    def process_batch(self) -> tuple[int, int]:
        """Process all PDFs in the batch directory.

        Returns:
            ``(successful, failed)`` counts.
        """
        pdf_files = self.find_pdf_files()

        if not pdf_files:
            logger.info("No PDF files found to process")
            return 0, 0

        successful = 0
        failed = 0

        logger.info(f"Starting batch processing of {len(pdf_files)} files...")

        # Process files sequentially (MinerU already handles parallelism internally)
        for pdf_file in pdf_files:
            result = self.converter.convert_file(str(pdf_file), self.delete_after)

            # str(result) already starts with a status marker; do not add another.
            if result.success:
                successful += 1
                logger.info(str(result))
            else:
                failed += 1
                logger.error(str(result))

        return successful, failed


def _build_parser() -> argparse.ArgumentParser:
    """Create the command line parser with its ``convert`` and ``batch`` sub-commands."""
    parser = argparse.ArgumentParser(
        description="Convert PDF files to Markdown using MinerU",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Convert a single PDF
  %(prog)s convert path/to/file.pdf

  # Batch convert all PDFs in batch-files directory
  %(prog)s batch

  # Batch convert with custom settings
  %(prog)s batch --batch-dir /path/to/pdfs --output-dir /path/to/output --workers 4

  # Delete PDFs after successful conversion
  %(prog)s batch --delete-after
        """
    )

    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Convert command
    convert_parser = subparsers.add_parser('convert', help='Convert a single PDF file')
    convert_parser.add_argument('pdf_file', help='Path to PDF file')
    convert_parser.add_argument('--output-dir', default='output', help='Output directory')
    convert_parser.add_argument('--delete-after', action='store_true',
                                help='Delete PDF after successful conversion')

    # Batch command
    batch_parser = subparsers.add_parser('batch', help='Batch convert PDF files')
    batch_parser.add_argument('--batch-dir', default='batch-files',
                              help='Directory containing PDF files')
    batch_parser.add_argument('--output-dir', default='output',
                              help='Output directory')
    batch_parser.add_argument('--workers', type=int, default=1,
                              help='Number of parallel workers '
                                   '(reserved; files are currently processed sequentially)')
    batch_parser.add_argument('--delete-after', action='store_true',
                              help='Delete PDFs after successful conversion')
    return parser


def main(argv=None):
    """Main entry point.

    Args:
        argv: Command line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Exits the process with status 0 on success and 1 when any conversion
    failed.  Without a sub-command, a leading argument that looks like a file
    selects ``convert``; no arguments at all select ``batch`` with defaults.
    """
    parser = _build_parser()
    argv = list(sys.argv[1:] if argv is None else argv)

    # argparse rejects an unknown first positional ("invalid choice") before
    # any post-parse fallback could run, so the bare-path shortcut has to
    # rewrite the argument list up front.
    if argv and argv[0] not in _COMMANDS and not argv[0].startswith('-'):
        if argv[0].lower().endswith('.pdf') or Path(argv[0]).exists():
            argv.insert(0, 'convert')

    args = parser.parse_args(argv)

    if not args.command:
        # No sub-command given: default to batch mode
        args.command = 'batch'
        args.batch_dir = 'batch-files'
        args.output_dir = 'output'
        args.workers = 1
        args.delete_after = False

    # Execute command
    if args.command == 'convert':
        converter = MineruPdfConverter(args.output_dir)
        result = converter.convert_file(args.pdf_file, args.delete_after)
        print(result)
        sys.exit(0 if result.success else 1)

    elif args.command == 'batch':
        processor = BatchProcessor(
            args.batch_dir,
            args.output_dir,
            args.workers,
            args.delete_after
        )
        successful, failed = processor.process_batch()

        print("\n📊 Batch processing complete:")
        print(f"   ✅ Successful: {successful}")
        print(f"   ❌ Failed: {failed}")
        print(f"   📁 Output directory: {args.output_dir}")

        sys.exit(0 if failed == 0 else 1)

    else:
        parser.print_help()
        sys.exit(1)


if __name__ == "__main__":
    main()
uvicorn==0.24.0 python-multipart==0.0.6 -aiofiles==23.2.1 \ No newline at end of file +aiofiles==23.2.1 + +# Basic PDF processing (will add MinerU later) +PyMuPDF>=1.18.16 \ No newline at end of file diff --git a/vendor/mineru/mineru/__init__.py b/vendor/mineru/mineru/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/vendor/mineru/mineru/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/vendor/mineru/mineru/backend/__init__.py b/vendor/mineru/mineru/backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/vendor/mineru/mineru/backend/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/vendor/mineru/mineru/backend/pipeline/__init__.py b/vendor/mineru/mineru/backend/pipeline/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/vendor/mineru/mineru/backend/pipeline/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. 
import cv2
from loguru import logger
from tqdm import tqdm
from collections import defaultdict
import numpy as np

from .model_init import AtomModelSingleton
from ...utils.config_reader import get_formula_enable, get_table_enable
from ...utils.model_utils import crop_img, get_res_list_from_layout_res
from ...utils.ocr_utils import (
    get_adjusted_mfdetrec_res,
    get_ocr_result_list,
    merge_det_boxes,
    OcrConfidence,
    sorted_boxes,
    update_det_boxes,
)

YOLO_LAYOUT_BASE_BATCH_SIZE = 8
MFD_BASE_BATCH_SIZE = 1
MFR_BASE_BATCH_SIZE = 16


class BatchAnalyze:
    """Run layout, formula, OCR and table models over a batch of page images.

    Calling an instance returns one layout-result list per input image; the
    formula, OCR-detection, table and OCR-recognition stages append to or
    update the items of those lists in place.
    """

    def __init__(self, model_manager, batch_ratio: int, formula_enable, table_enable, enable_ocr_det_batch: bool = True):
        """Store the configuration.

        Args:
            model_manager: Object whose ``get_model(...)`` returns the model
                bundle (``layout_model``, ``mfd_model``, ``mfr_model``).
            batch_ratio: Multiplier applied to the base batch sizes.
            formula_enable: Requested formula switch, resolved through
                ``get_formula_enable``.
            table_enable: Requested table switch, resolved through
                ``get_table_enable``.
            enable_ocr_det_batch: Use the batched OCR detection path instead
                of detecting crop by crop.
        """
        self.batch_ratio = batch_ratio
        self.formula_enable = get_formula_enable(formula_enable)
        self.table_enable = get_table_enable(table_enable)
        self.model_manager = model_manager
        self.enable_ocr_det_batch = enable_ocr_det_batch

    def __call__(self, images_with_extra_info: list) -> list:
        """Analyze a batch of images.

        Args:
            images_with_extra_info: List of ``(image, ocr_enable, lang)``
                tuples.  # presumably PIL page images — confirm with callers

        Returns:
            A list with one layout-result list per input image (empty when the
            input is empty).
        """
        if len(images_with_extra_info) == 0:
            return []

        images_layout_res = []

        self.model = self.model_manager.get_model(
            lang=None,
            formula_enable=self.formula_enable,
            table_enable=self.table_enable,
        )
        atom_model_manager = AtomModelSingleton()

        images = [image for image, _, _ in images_with_extra_info]

        # Layout detection (doclayout_yolo)
        images_layout_res += self.model.layout_model.batch_predict(
            list(images), YOLO_LAYOUT_BASE_BATCH_SIZE
        )

        if self.formula_enable:
            # Formula detection
            images_mfd_res = self.model.mfd_model.batch_predict(
                images, MFD_BASE_BATCH_SIZE
            )

            # Formula recognition
            images_formula_list = self.model.mfr_model.batch_predict(
                images_mfd_res,
                images,
                batch_size=self.batch_ratio * MFR_BASE_BATCH_SIZE,
            )
            for image_index in range(len(images)):
                images_layout_res[image_index] += images_formula_list[image_index]

        # Collect, per page, the regions needing OCR and the table crops.
        ocr_res_list_all_page = []
        table_res_list_all_page = []
        for index in range(len(images)):
            _, ocr_enable, _lang = images_with_extra_info[index]
            layout_res = images_layout_res[index]
            pil_img = images[index]

            ocr_res_list, table_res_list, single_page_mfdetrec_res = (
                get_res_list_from_layout_res(layout_res)
            )

            ocr_res_list_all_page.append({'ocr_res_list': ocr_res_list,
                                          'lang': _lang,
                                          'ocr_enable': ocr_enable,
                                          'pil_img': pil_img,
                                          'single_page_mfdetrec_res': single_page_mfdetrec_res,
                                          'layout_res': layout_res,
                                          })

            for table_res in table_res_list:
                table_img, _ = crop_img(table_res, pil_img)
                table_res_list_all_page.append({'table_res': table_res,
                                                'lang': _lang,
                                                'table_img': table_img,
                                                })

        # OCR text detection
        if self.enable_ocr_det_batch:
            self._detect_text_batched(ocr_res_list_all_page, atom_model_manager)
        else:
            self._detect_text_single(ocr_res_list_all_page, atom_model_manager)

        # Table recognition
        if self.table_enable:
            self._recognize_tables(table_res_list_all_page, atom_model_manager)

        # OCR text recognition
        self._recognize_text(images_layout_res, atom_model_manager)

        return images_layout_res

    def _detect_text_batched(self, ocr_res_list_all_page, atom_model_manager):
        """Detect text boxes in batches grouped by language and crop resolution."""
        # Gather every crop that needs OCR detection.
        all_cropped_images_info = []
        for ocr_res_list_dict in ocr_res_list_all_page:
            _lang = ocr_res_list_dict['lang']

            for res in ocr_res_list_dict['ocr_res_list']:
                new_image, useful_list = crop_img(
                    res, ocr_res_list_dict['pil_img'], crop_paste_x=50, crop_paste_y=50
                )
                adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
                    ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
                )

                # RGB -> BGR conversion
                new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)

                all_cropped_images_info.append((
                    new_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang
                ))

        # Group by language
        lang_groups = defaultdict(list)
        for crop_info in all_cropped_images_info:
            lang_groups[crop_info[5]].append(crop_info)

        for lang, lang_crop_list in lang_groups.items():
            if not lang_crop_list:
                continue

            ocr_model = atom_model_manager.get_atom_model(
                atom_model_name='ocr',
                det_db_box_thresh=0.3,
                lang=lang
            )

            # Bucket crops by size in steps of 32 pixels.  The key is the next
            # multiple of 32 strictly above the size (an exact multiple such as
            # 32 lands in the 64 bucket); it is used for grouping only.
            resolution_groups = defaultdict(list)
            for crop_info in lang_crop_list:
                h, w = crop_info[0].shape[:2]
                normalized_h = ((h + 32) // 32) * 32
                normalized_w = ((w + 32) // 32) * 32
                resolution_groups[(normalized_h, normalized_w)].append(crop_info)

            for group_key, group_crops in tqdm(resolution_groups.items(), desc=f"OCR-det {lang}"):

                # Target size: the largest crop of the group, rounded up to a
                # multiple of 32.
                max_h = max(crop_info[0].shape[0] for crop_info in group_crops)
                max_w = max(crop_info[0].shape[1] for crop_info in group_crops)
                target_h = ((max_h + 32 - 1) // 32) * 32
                target_w = ((max_w + 32 - 1) // 32) * 32

                # Pad every crop to the target size: white canvas, original
                # pasted into the top-left corner.
                batch_images = []
                for crop_info in group_crops:
                    img = crop_info[0]
                    h, w = img.shape[:2]
                    padded_img = np.ones((target_h, target_w, 3), dtype=np.uint8) * 255
                    padded_img[:h, :w] = img
                    batch_images.append(padded_img)

                # Batched detection
                batch_size = min(len(batch_images), self.batch_ratio * 16)
                batch_results = ocr_model.text_detector.batch_predict(batch_images, batch_size)

                for crop_info, (dt_boxes, elapse) in zip(group_crops, batch_results):
                    new_image, useful_list, ocr_res_list_dict, res, adjusted_mfdetrec_res, _lang = crop_info

                    if dt_boxes is not None and len(dt_boxes) > 0:
                        # 1. Sort the detected boxes
                        dt_boxes_sorted = sorted_boxes(dt_boxes)

                        # 2. Merge adjacent boxes
                        if dt_boxes_sorted:
                            dt_boxes_merged = merge_det_boxes(dt_boxes_sorted)
                        else:
                            dt_boxes_merged = []

                        # 3. Update the boxes according to the formula positions
                        if dt_boxes_merged and adjusted_mfdetrec_res:
                            dt_boxes_final = update_det_boxes(dt_boxes_merged, adjusted_mfdetrec_res)
                        else:
                            dt_boxes_final = dt_boxes_merged

                        # Build the OCR result format
                        ocr_res = [box.tolist() if hasattr(box, 'tolist') else box for box in dt_boxes_final]

                        if ocr_res:
                            ocr_result_list = get_ocr_result_list(
                                ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang
                            )

                            ocr_res_list_dict['layout_res'].extend(ocr_result_list)

    def _detect_text_single(self, ocr_res_list_all_page, atom_model_manager):
        """Detect text boxes crop by crop (non-batched path)."""
        for ocr_res_list_dict in tqdm(ocr_res_list_all_page, desc="OCR-det Predict"):
            # Process each area that requires OCR processing
            _lang = ocr_res_list_dict['lang']
            # Get the OCR model for this page's language
            ocr_model = atom_model_manager.get_atom_model(
                atom_model_name='ocr',
                ocr_show_log=False,
                det_db_box_thresh=0.3,
                lang=_lang
            )
            for res in ocr_res_list_dict['ocr_res_list']:
                new_image, useful_list = crop_img(
                    res, ocr_res_list_dict['pil_img'], crop_paste_x=50, crop_paste_y=50
                )
                adjusted_mfdetrec_res = get_adjusted_mfdetrec_res(
                    ocr_res_list_dict['single_page_mfdetrec_res'], useful_list
                )
                # OCR-det
                new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
                ocr_res = ocr_model.ocr(
                    new_image, mfd_res=adjusted_mfdetrec_res, rec=False
                )[0]

                # Integration results
                if ocr_res:
                    ocr_result_list = get_ocr_result_list(
                        ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang
                    )

                    ocr_res_list_dict['layout_res'].extend(ocr_result_list)

    def _recognize_tables(self, table_res_list_all_page, atom_model_manager):
        """Run table recognition and attach the HTML to each table result."""
        for table_res_dict in tqdm(table_res_list_all_page, desc="Table Predict"):
            _lang = table_res_dict['lang']
            table_model = atom_model_manager.get_atom_model(
                atom_model_name='table',
                lang=_lang,
            )
            html_code, table_cell_bboxes, logic_points, elapse = table_model.predict(table_res_dict['table_img'])
            # Accept the output only when it ends with a closing tag.  An empty
            # suffix here would make the check vacuous: str.endswith('') is
            # always True.
            if html_code:
                expected_ending = html_code.strip().endswith(('</html>', '</table>'))
                if expected_ending:
                    table_res_dict['table_res']['html'] = html_code
                else:
                    logger.warning(
                        'table recognition processing fails, not found expected HTML table end'
                    )
            else:
                logger.warning(
                    'table recognition processing fails, not get html return'
                )

    def _recognize_text(self, images_layout_res, atom_model_manager):
        """Recognize text for queued layout items, one OCR model per language."""
        # Create dictionaries to store items by language
        need_ocr_lists_by_lang = {}  # Dict of lists for each language
        img_crop_lists_by_lang = {}  # Dict of lists for each language

        for layout_res in images_layout_res:
            for layout_res_item in layout_res:
                # NOTE(review): category 15 is presumably the OCR text span
                # category — confirm against the category definitions.
                if layout_res_item['category_id'] in [15]:
                    if 'np_img' in layout_res_item and 'lang' in layout_res_item:
                        lang = layout_res_item['lang']

                        # Initialize lists for this language if not exist
                        if lang not in need_ocr_lists_by_lang:
                            need_ocr_lists_by_lang[lang] = []
                            img_crop_lists_by_lang[lang] = []

                        # Add to the appropriate language-specific lists
                        need_ocr_lists_by_lang[lang].append(layout_res_item)
                        img_crop_lists_by_lang[lang].append(layout_res_item['np_img'])

                        # Remove the fields after adding to lists
                        layout_res_item.pop('np_img')
                        layout_res_item.pop('lang')

        # Process each language separately
        for lang, img_crop_list in img_crop_lists_by_lang.items():
            if len(img_crop_list) > 0:
                # Get OCR results for this language's images
                ocr_model = atom_model_manager.get_atom_model(
                    atom_model_name='ocr',
                    det_db_box_thresh=0.3,
                    lang=lang
                )
                ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]

                # Verify we have matching counts
                assert len(ocr_res_list) == len(
                    need_ocr_lists_by_lang[lang]), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_lists_by_lang[lang])} for lang: {lang}'

                # Process OCR results for this language
                for index, layout_res_item in enumerate(need_ocr_lists_by_lang[lang]):
                    ocr_text, ocr_score = ocr_res_list[index]
                    layout_res_item['text'] = ocr_text
                    layout_res_item['score'] = float(f"{ocr_score:.3f}")
                    if ocr_score < OcrConfidence.min_confidence:
                        # NOTE(review): category 16 appears to mark a rejected
                        # span — confirm.
                        layout_res_item['category_id'] = 16
                    else:
                        layout_res_bbox = [layout_res_item['poly'][0], layout_res_item['poly'][1],
                                           layout_res_item['poly'][4], layout_res_item['poly'][5]]
                        layout_res_width = layout_res_bbox[2] - layout_res_bbox[0]
                        layout_res_height = layout_res_bbox[3] - layout_res_bbox[1]
                        # Also reject a fixed set of texts when the score is
                        # below 0.8 and the box is taller than wide.
                        if ocr_text in ['(204号', '(20', '(2', '(2号', '(20号'] and ocr_score < 0.8 and layout_res_width < layout_res_height:
                            layout_res_item['category_id'] = 16
import os

import torch
from loguru import logger

from .model_list import AtomicModel
from ...model.layout.doclayout_yolo import DocLayoutYOLOModel
from ...model.mfd.yolo_v8 import YOLOv8MFDModel
from ...model.mfr.unimernet.Unimernet import UnimernetModel
from ...model.ocr.paddleocr2pytorch.pytorch_paddle import PytorchPaddleOCR
from ...model.table.rapid_table import RapidTableModel
from ...utils.enum_class import ModelPath
from ...utils.models_download_utils import auto_download_and_get_model_root_path

# Defaults of ocr_model_init, repeated here so cache keys and forwarded
# arguments agree when a caller omits them.
_DEFAULT_DET_DB_BOX_THRESH = 0.3
_DEFAULT_DET_DB_UNCLIP_RATIO = 1.8


def table_model_init(lang=None):
    """Build the table model on top of an OCR engine for *lang*.

    The OCR engine is requested with a box threshold of 0.5 and an unclip
    ratio of 1.6.
    """
    atom_model_manager = AtomModelSingleton()
    ocr_engine = atom_model_manager.get_atom_model(
        atom_model_name='ocr',
        det_db_box_thresh=0.5,
        det_db_unclip_ratio=1.6,
        lang=lang
    )
    table_model = RapidTableModel(ocr_engine)
    return table_model


def mfd_model_init(weight, device='cpu'):
    """Build the formula detection model from *weight* on *device*."""
    if str(device).startswith('npu'):
        device = torch.device(device)
    mfd_model = YOLOv8MFDModel(weight, device)
    return mfd_model


def mfr_model_init(weight_dir, device='cpu'):
    """Build the formula recognition model from *weight_dir* on *device*."""
    mfr_model = UnimernetModel(weight_dir, device)
    return mfr_model


def doclayout_yolo_model_init(weight, device='cpu'):
    """Build the layout detection model from *weight* on *device*."""
    if str(device).startswith('npu'):
        device = torch.device(device)
    model = DocLayoutYOLOModel(weight, device)
    return model


def ocr_model_init(det_db_box_thresh=0.3,
                   lang=None,
                   use_dilation=True,
                   det_db_unclip_ratio=1.8,
                   ):
    """Build an OCR model; ``lang`` is only passed on when it is non-empty."""
    if lang is not None and lang != '':
        model = PytorchPaddleOCR(
            det_db_box_thresh=det_db_box_thresh,
            lang=lang,
            use_dilation=use_dilation,
            det_db_unclip_ratio=det_db_unclip_ratio,
        )
    else:
        model = PytorchPaddleOCR(
            det_db_box_thresh=det_db_box_thresh,
            use_dilation=use_dilation,
            det_db_unclip_ratio=det_db_unclip_ratio,
        )
    return model


class AtomModelSingleton:
    """Process-wide cache of initialized atomic models."""

    _instance = None
    _models = {}

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get_atom_model(self, atom_model_name: str, **kwargs):
        """Return the cached model for the request, creating it on first use.

        OCR models are cached per language *and* detection parameters, so a
        request with a different ``det_db_box_thresh`` or
        ``det_db_unclip_ratio`` (as made by ``table_model_init``) does not get
        an engine that was configured for another caller.
        """
        lang = kwargs.get('lang', None)
        table_model_name = kwargs.get('table_model_name', None)

        if atom_model_name in [AtomicModel.OCR]:
            key = (
                atom_model_name,
                lang,
                kwargs.get('det_db_box_thresh', _DEFAULT_DET_DB_BOX_THRESH),
                kwargs.get('det_db_unclip_ratio', _DEFAULT_DET_DB_UNCLIP_RATIO),
            )
        elif atom_model_name in [AtomicModel.Table]:
            key = (atom_model_name, table_model_name, lang)
        else:
            key = atom_model_name

        if key not in self._models:
            self._models[key] = atom_model_init(model_name=atom_model_name, **kwargs)
        return self._models[key]


def atom_model_init(model_name: str, **kwargs):
    """Create the atomic model called *model_name* from keyword settings.

    Terminates the process with exit status 1 when the name is not recognised
    or the initializer returns ``None``.
    """
    atom_model = None
    if model_name == AtomicModel.Layout:
        atom_model = doclayout_yolo_model_init(
            kwargs.get('doclayout_yolo_weights'),
            kwargs.get('device')
        )
    elif model_name == AtomicModel.MFD:
        atom_model = mfd_model_init(
            kwargs.get('mfd_weights'),
            kwargs.get('device')
        )
    elif model_name == AtomicModel.MFR:
        atom_model = mfr_model_init(
            kwargs.get('mfr_weight_dir'),
            kwargs.get('device')
        )
    elif model_name == AtomicModel.OCR:
        # Fall back to the documented defaults instead of forwarding None, and
        # pass the unclip ratio through so it is honoured.
        atom_model = ocr_model_init(
            kwargs.get('det_db_box_thresh', _DEFAULT_DET_DB_BOX_THRESH),
            kwargs.get('lang'),
            det_db_unclip_ratio=kwargs.get('det_db_unclip_ratio', _DEFAULT_DET_DB_UNCLIP_RATIO),
        )
    elif model_name == AtomicModel.Table:
        atom_model = table_model_init(
            kwargs.get('lang'),
        )
    else:
        logger.error('model name not allow')
        # Same effect as exit(1) without relying on the site-provided builtin.
        raise SystemExit(1)

    if atom_model is None:
        logger.error('model init failed')
        raise SystemExit(1)
    else:
        return atom_model


class MineruPipelineModel:
    """Bundle of the models used by the pipeline backend.

    Always loads the layout and OCR models; the formula detection/recognition
    and table models are loaded only when enabled in their config dicts.
    """

    def __init__(self, **kwargs):
        """Load the models.

        Keyword Args:
            formula_config: Dict; its ``enable`` entry (default True) switches
                the formula models.
            table_config: Dict; its ``enable`` entry (default True) switches
                the table model.
            lang: OCR language, default None.
            device: Device for the detection/recognition models, default 'cpu'.
        """
        self.formula_config = kwargs.get('formula_config')
        # Tolerate an omitted config instead of failing on None.get(...).
        self.apply_formula = (self.formula_config or {}).get('enable', True)
        self.table_config = kwargs.get('table_config')
        self.apply_table = (self.table_config or {}).get('enable', True)
        self.lang = kwargs.get('lang', None)
        self.device = kwargs.get('device', 'cpu')
        logger.info(
            'DocAnalysis init, this may take some times......'
        )
        atom_model_manager = AtomModelSingleton()

        if self.apply_formula:
            # Initialize the formula detection model
            self.mfd_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.MFD,
                mfd_weights=str(
                    os.path.join(auto_download_and_get_model_root_path(ModelPath.yolo_v8_mfd), ModelPath.yolo_v8_mfd)
                ),
                device=self.device,
            )

            # Initialize the formula recognition model
            mfr_weight_dir = os.path.join(auto_download_and_get_model_root_path(ModelPath.unimernet_small), ModelPath.unimernet_small)

            self.mfr_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.MFR,
                mfr_weight_dir=mfr_weight_dir,
                device=self.device,
            )

        # Initialize the layout model
        self.layout_model = atom_model_manager.get_atom_model(
            atom_model_name=AtomicModel.Layout,
            doclayout_yolo_weights=str(
                os.path.join(auto_download_and_get_model_root_path(ModelPath.doclayout_yolo), ModelPath.doclayout_yolo)
            ),
            device=self.device,
        )
        # Initialize OCR
        self.ocr_model = atom_model_manager.get_atom_model(
            atom_model_name=AtomicModel.OCR,
            det_db_box_thresh=0.3,
            lang=self.lang
        )
        # init table model
        if self.apply_table:
            self.table_model = atom_model_manager.get_atom_model(
                atom_model_name=AtomicModel.Table,
                lang=self.lang,
            )

        logger.info('DocAnalysis init done!')
scale = image_dict["scale"] + page_pil_img = image_dict["img_pil"] + page_img_md5 = str_md5(image_dict["img_base64"]) + page_w, page_h = map(int, page.get_size()) + magic_model = MagicModel(page_model_info, scale) + + """从magic_model对象中获取后面会用到的区块信息""" + discarded_blocks = magic_model.get_discarded() + text_blocks = magic_model.get_text_blocks() + title_blocks = magic_model.get_title_blocks() + inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations() + + img_groups = magic_model.get_imgs() + table_groups = magic_model.get_tables() + + """对image和table的区块分组""" + img_body_blocks, img_caption_blocks, img_footnote_blocks, maybe_text_image_blocks = process_groups( + img_groups, 'image_body', 'image_caption_list', 'image_footnote_list' + ) + + table_body_blocks, table_caption_blocks, table_footnote_blocks, _ = process_groups( + table_groups, 'table_body', 'table_caption_list', 'table_footnote_list' + ) + + """获取所有的spans信息""" + spans = magic_model.get_all_spans() + + """某些图可能是文本块,通过简单的规则判断一下""" + if len(maybe_text_image_blocks) > 0: + for block in maybe_text_image_blocks: + span_in_block_list = [] + for span in spans: + if span['type'] == 'text' and calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block['bbox']) > 0.7: + span_in_block_list.append(span) + if len(span_in_block_list) > 0: + # span_in_block_list中所有bbox的面积之和 + spans_area = sum((span['bbox'][2] - span['bbox'][0]) * (span['bbox'][3] - span['bbox'][1]) for span in span_in_block_list) + # 求ocr_res_area和res的面积的比值 + block_area = (block['bbox'][2] - block['bbox'][0]) * (block['bbox'][3] - block['bbox'][1]) + if block_area > 0: + ratio = spans_area / block_area + if ratio > 0.25 and ocr_enable: + # 移除block的group_id + block.pop('group_id', None) + # 符合文本图的条件就把块加入到文本块列表中 + text_blocks.append(block) + else: + # 如果不符合文本图的条件,就把块加回到图片块列表中 + img_body_blocks.append(block) + else: + img_body_blocks.append(block) + + + """将所有区块的bbox整理到一起""" + if formula_enabled: + 
interline_equation_blocks = [] + + if len(interline_equation_blocks) > 0: + + for block in interline_equation_blocks: + spans.append({ + "type": ContentType.INTERLINE_EQUATION, + 'score': block['score'], + "bbox": block['bbox'], + }) + + all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes( + img_body_blocks, img_caption_blocks, img_footnote_blocks, + table_body_blocks, table_caption_blocks, table_footnote_blocks, + discarded_blocks, + text_blocks, + title_blocks, + interline_equation_blocks, + page_w, + page_h, + ) + else: + all_bboxes, all_discarded_blocks, footnote_blocks = prepare_block_bboxes( + img_body_blocks, img_caption_blocks, img_footnote_blocks, + table_body_blocks, table_caption_blocks, table_footnote_blocks, + discarded_blocks, + text_blocks, + title_blocks, + interline_equations, + page_w, + page_h, + ) + + """在删除重复span之前,应该通过image_body和table_body的block过滤一下image和table的span""" + """顺便删除大水印并保留abandon的span""" + spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks) + + """删除重叠spans中置信度较低的那些""" + spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans) + """删除重叠spans中较小的那些""" + spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans) + + """根据parse_mode,构造spans,主要是文本类的字符填充""" + if ocr_enable: + pass + else: + """使用新版本的混合ocr方案.""" + spans = txt_spans_extract(page, spans, page_pil_img, scale, all_bboxes, all_discarded_blocks) + + """先处理不需要排版的discarded_blocks""" + discarded_block_with_spans, spans = fill_spans_in_blocks( + all_discarded_blocks, spans, 0.4 + ) + fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans) + + """如果当前页面没有有效的bbox则跳过""" + if len(all_bboxes) == 0: + return None + + """对image/table/interline_equation截图""" + for span in spans: + if span['type'] in [ContentType.IMAGE, ContentType.TABLE, ContentType.INTERLINE_EQUATION]: + span = cut_image_and_table( + span, page_pil_img, page_img_md5, page_index, image_writer, scale=scale + ) + + """span填充进block""" + 
block_with_spans, spans = fill_spans_in_blocks(all_bboxes, spans, 0.5) + + """对block进行fix操作""" + fix_blocks = fix_block_spans(block_with_spans) + + """对block进行排序""" + sorted_blocks = sort_blocks_by_bbox(fix_blocks, page_w, page_h, footnote_blocks) + + """构造page_info""" + page_info = make_page_info_dict(sorted_blocks, page_index, page_w, page_h, fix_discarded_blocks) + + return page_info + + +def result_to_middle_json(model_list, images_list, pdf_doc, image_writer, lang=None, ocr_enable=False, formula_enabled=True): + middle_json = {"pdf_info": [], "_backend":"pipeline", "_version_name": __version__} + formula_enabled = get_formula_enable(formula_enabled) + for page_index, page_model_info in tqdm(enumerate(model_list), total=len(model_list), desc="Processing pages"): + page = pdf_doc[page_index] + image_dict = images_list[page_index] + page_info = page_model_info_to_page_info( + page_model_info, image_dict, page, image_writer, page_index, ocr_enable=ocr_enable, formula_enabled=formula_enabled + ) + if page_info is None: + page_w, page_h = map(int, page.get_size()) + page_info = make_page_info_dict([], page_index, page_w, page_h, []) + middle_json["pdf_info"].append(page_info) + + """后置ocr处理""" + need_ocr_list = [] + img_crop_list = [] + text_block_list = [] + for page_info in middle_json["pdf_info"]: + for block in page_info['preproc_blocks']: + if block['type'] in ['table', 'image']: + for sub_block in block['blocks']: + if sub_block['type'] in ['image_caption', 'image_footnote', 'table_caption', 'table_footnote']: + text_block_list.append(sub_block) + elif block['type'] in ['text', 'title']: + text_block_list.append(block) + for block in page_info['discarded_blocks']: + text_block_list.append(block) + for block in text_block_list: + for line in block['lines']: + for span in line['spans']: + if 'np_img' in span: + need_ocr_list.append(span) + img_crop_list.append(span['np_img']) + span.pop('np_img') + if len(img_crop_list) > 0: + atom_model_manager = 
AtomModelSingleton() + ocr_model = atom_model_manager.get_atom_model( + atom_model_name='ocr', + ocr_show_log=False, + det_db_box_thresh=0.3, + lang=lang + ) + ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0] + assert len(ocr_res_list) == len( + need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}' + for index, span in enumerate(need_ocr_list): + ocr_text, ocr_score = ocr_res_list[index] + if ocr_score > OcrConfidence.min_confidence: + span['content'] = ocr_text + span['score'] = float(f"{ocr_score:.3f}") + else: + span['content'] = '' + span['score'] = 0.0 + + """分段""" + para_split(middle_json["pdf_info"]) + + """llm优化""" + llm_aided_config = get_llm_aided_config() + + if llm_aided_config is not None: + """标题优化""" + title_aided_config = llm_aided_config.get('title_aided', None) + if title_aided_config is not None: + if title_aided_config.get('enable', False): + llm_aided_title_start_time = time.time() + llm_aided_title(middle_json["pdf_info"], title_aided_config) + logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}') + + """清理内存""" + pdf_doc.close() + if os.getenv('MINERU_DONOT_CLEAN_MEM') is None and len(model_list) >= 10: + clean_memory(get_device()) + + return middle_json + + +def make_page_info_dict(blocks, page_id, page_w, page_h, discarded_blocks): + return_dict = { + 'preproc_blocks': blocks, + 'page_idx': page_id, + 'page_size': [page_w, page_h], + 'discarded_blocks': discarded_blocks, + } + return return_dict \ No newline at end of file diff --git a/vendor/mineru/mineru/backend/pipeline/model_list.py b/vendor/mineru/mineru/backend/pipeline/model_list.py new file mode 100644 index 0000000000000000000000000000000000000000..3676ae6709b7eeea746470f2233e0772a6bf62bf --- /dev/null +++ b/vendor/mineru/mineru/backend/pipeline/model_list.py @@ -0,0 +1,6 @@ +class AtomicModel: + Layout = "layout" + MFD = "mfd" + MFR = "mfr" + OCR = "ocr" + Table = "table" diff --git 
a/vendor/mineru/mineru/backend/pipeline/para_split.py b/vendor/mineru/mineru/backend/pipeline/para_split.py new file mode 100644 index 0000000000000000000000000000000000000000..513311ff408f482d45fffe06cf7bc4658cce6e48 --- /dev/null +++ b/vendor/mineru/mineru/backend/pipeline/para_split.py @@ -0,0 +1,381 @@ +import copy +from loguru import logger +from mineru.utils.enum_class import ContentType, BlockType, SplitFlag +from mineru.utils.language import detect_lang + + +LINE_STOP_FLAG = ('.', '!', '?', '。', '!', '?', ')', ')', '"', '”', ':', ':', ';', ';') +LIST_END_FLAG = ('.', '。', ';', ';') + + +class ListLineTag: + IS_LIST_START_LINE = 'is_list_start_line' + IS_LIST_END_LINE = 'is_list_end_line' + + +def __process_blocks(blocks): + # 对所有block预处理 + # 1.通过title和interline_equation将block分组 + # 2.bbox边界根据line信息重置 + + result = [] + current_group = [] + + for i in range(len(blocks)): + current_block = blocks[i] + + # 如果当前块是 text 类型 + if current_block['type'] == 'text': + current_block['bbox_fs'] = copy.deepcopy(current_block['bbox']) + if 'lines' in current_block and len(current_block['lines']) > 0: + current_block['bbox_fs'] = [ + min([line['bbox'][0] for line in current_block['lines']]), + min([line['bbox'][1] for line in current_block['lines']]), + max([line['bbox'][2] for line in current_block['lines']]), + max([line['bbox'][3] for line in current_block['lines']]), + ] + current_group.append(current_block) + + # 检查下一个块是否存在 + if i + 1 < len(blocks): + next_block = blocks[i + 1] + # 如果下一个块不是 text 类型且是 title 或 interline_equation 类型 + if next_block['type'] in ['title', 'interline_equation']: + result.append(current_group) + current_group = [] + + # 处理最后一个 group + if current_group: + result.append(current_group) + + return result + + +def __is_list_or_index_block(block): + # 一个block如果是list block 应该同时满足以下特征 + # 1.block内有多个line 2.block 内有多个line左侧顶格写 3.block内有多个line 右侧不顶格(狗牙状) + # 1.block内有多个line 2.block 内有多个line左侧顶格写 3.多个line以endflag结尾 + # 1.block内有多个line 2.block 
内有多个line左侧顶格写 3.block内有多个line 左侧不顶格 + + # index block 是一种特殊的list block + # 一个block如果是index block 应该同时满足以下特征 + # 1.block内有多个line 2.block 内有多个line两侧均顶格写 3.line的开头或者结尾均为数字 + if len(block['lines']) >= 2: + first_line = block['lines'][0] + line_height = first_line['bbox'][3] - first_line['bbox'][1] + block_weight = block['bbox_fs'][2] - block['bbox_fs'][0] + block_height = block['bbox_fs'][3] - block['bbox_fs'][1] + page_weight, page_height = block['page_size'] + + left_close_num = 0 + left_not_close_num = 0 + right_not_close_num = 0 + right_close_num = 0 + lines_text_list = [] + center_close_num = 0 + external_sides_not_close_num = 0 + multiple_para_flag = False + last_line = block['lines'][-1] + + if page_weight == 0: + block_weight_radio = 0 + else: + block_weight_radio = block_weight / page_weight + # logger.info(f"block_weight_radio: {block_weight_radio}") + + # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 (第一行可能可以右边不顶格) + if ( + first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 + and abs(last_line['bbox'][0] - block['bbox_fs'][0]) < line_height / 2 + and block['bbox_fs'][2] - last_line['bbox'][2] > line_height + ): + multiple_para_flag = True + + block_text = '' + + for line in block['lines']: + line_text = '' + + for span in line['spans']: + span_type = span['type'] + if span_type == ContentType.TEXT: + line_text += span['content'].strip() + # 添加所有文本,包括空行,保持与block['lines']长度一致 + lines_text_list.append(line_text) + block_text = ''.join(lines_text_list) + + block_lang = detect_lang(block_text) + # logger.info(f"block_lang: {block_lang}") + + for line in block['lines']: + line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2 + block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2 + if ( + line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height + and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height + ): + external_sides_not_close_num += 1 + if abs(line_mid_x - block_mid_x) < line_height / 2: + center_close_num += 1 + + # 
计算line左侧顶格数量是否大于2,是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断 + if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: + left_close_num += 1 + elif line['bbox'][0] - block['bbox_fs'][0] > line_height: + left_not_close_num += 1 + + # 计算右侧是否顶格 + if abs(block['bbox_fs'][2] - line['bbox'][2]) < line_height: + right_close_num += 1 + else: + # 类中文没有超长单词的情况,可以用统一的阈值 + if block_lang in ['zh', 'ja', 'ko']: + closed_area = 0.26 * block_weight + else: + # 右侧不顶格情况下是否有一段距离,拍脑袋用0.3block宽度做阈值 + # block宽的阈值可以小些,block窄的阈值要大 + if block_weight_radio >= 0.5: + closed_area = 0.26 * block_weight + else: + closed_area = 0.36 * block_weight + if block['bbox_fs'][2] - line['bbox'][2] > closed_area: + right_not_close_num += 1 + + # 判断lines_text_list中的元素是否有超过80%都以LIST_END_FLAG结尾 + line_end_flag = False + # 判断lines_text_list中的元素是否有超过80%都以数字开头或都以数字结尾 + line_num_flag = False + num_start_count = 0 + num_end_count = 0 + flag_end_count = 0 + + if len(lines_text_list) > 0: + for line_text in lines_text_list: + if len(line_text) > 0: + if line_text[-1] in LIST_END_FLAG: + flag_end_count += 1 + if line_text[0].isdigit(): + num_start_count += 1 + if line_text[-1].isdigit(): + num_end_count += 1 + + if ( + num_start_count / len(lines_text_list) >= 0.8 + or num_end_count / len(lines_text_list) >= 0.8 + ): + line_num_flag = True + if flag_end_count / len(lines_text_list) >= 0.8: + line_end_flag = True + + # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边,且符合数字规则极为index + if ( + left_close_num / len(block['lines']) >= 0.8 + or right_close_num / len(block['lines']) >= 0.8 + ) and line_num_flag: + for line in block['lines']: + line[ListLineTag.IS_LIST_START_LINE] = True + return BlockType.INDEX + + # 全部line都居中的特殊list识别,每行都需要换行,特征是多行,且大多数行都前后not_close,每line中点x坐标接近 + # 补充条件block的长宽比有要求 + elif ( + external_sides_not_close_num >= 2 + and center_close_num == len(block['lines']) + and external_sides_not_close_num / len(block['lines']) >= 0.5 + and block_height / block_weight > 0.4 + ): + for line in 
block['lines']: + line[ListLineTag.IS_LIST_START_LINE] = True + return BlockType.LIST + + elif ( + left_close_num >= 2 + and (right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) + and not multiple_para_flag + # and block_weight_radio > 0.27 + ): + # 处理一种特殊的没有缩进的list,所有行都贴左边,通过右边的空隙判断是否是item尾 + if left_close_num / len(block['lines']) > 0.8: + # 这种是每个item只有一行,且左边都贴边的短item list + if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5: + for line in block['lines']: + if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: + line[ListLineTag.IS_LIST_START_LINE] = True + # 这种是大部分line item 都有结束标识符的情况,按结束标识符区分不同item + elif line_end_flag: + for i, line in enumerate(block['lines']): + if ( + len(lines_text_list[i]) > 0 + and lines_text_list[i][-1] in LIST_END_FLAG + ): + line[ListLineTag.IS_LIST_END_LINE] = True + if i + 1 < len(block['lines']): + block['lines'][i + 1][ + ListLineTag.IS_LIST_START_LINE + ] = True + # line item基本没有结束标识符,而且也没有缩进,按右侧空隙判断哪些是item end + else: + line_start_flag = False + for i, line in enumerate(block['lines']): + if line_start_flag: + line[ListLineTag.IS_LIST_START_LINE] = True + line_start_flag = False + + if ( + abs(block['bbox_fs'][2] - line['bbox'][2]) + > 0.1 * block_weight + ): + line[ListLineTag.IS_LIST_END_LINE] = True + line_start_flag = True + # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头,end line 以 IS_LIST_END_FLAG 结尾且数量和start line 一致 + elif num_start_count >= 2 and num_start_count == flag_end_count: + for i, line in enumerate(block['lines']): + if len(lines_text_list[i]) > 0: + if lines_text_list[i][0].isdigit(): + line[ListLineTag.IS_LIST_START_LINE] = True + if lines_text_list[i][-1] in LIST_END_FLAG: + line[ListLineTag.IS_LIST_END_LINE] = True + else: + # 正常有缩进的list处理 + for line in block['lines']: + if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2: + line[ListLineTag.IS_LIST_START_LINE] = True + if abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height: + 
line[ListLineTag.IS_LIST_END_LINE] = True + + return BlockType.LIST + else: + return BlockType.TEXT + else: + return BlockType.TEXT + + +def __merge_2_text_blocks(block1, block2): + if len(block1['lines']) > 0: + first_line = block1['lines'][0] + line_height = first_line['bbox'][3] - first_line['bbox'][1] + block1_weight = block1['bbox'][2] - block1['bbox'][0] + block2_weight = block2['bbox'][2] - block2['bbox'][0] + min_block_weight = min(block1_weight, block2_weight) + if abs(block1['bbox_fs'][0] - first_line['bbox'][0]) < line_height / 2: + last_line = block2['lines'][-1] + if len(last_line['spans']) > 0: + last_span = last_line['spans'][-1] + line_height = last_line['bbox'][3] - last_line['bbox'][1] + if len(first_line['spans']) > 0: + first_span = first_line['spans'][0] + if len(first_span['content']) > 0: + span_start_with_num = first_span['content'][0].isdigit() + span_start_with_big_char = first_span['content'][0].isupper() + if ( + # 上一个block的最后一个line的右边界和block的右边界差距不超过line_height + abs(block2['bbox_fs'][2] - last_line['bbox'][2]) < line_height + # 上一个block的最后一个span不是以特定符号结尾 + and not last_span['content'].endswith(LINE_STOP_FLAG) + # 两个block宽度差距超过2倍也不合并 + and abs(block1_weight - block2_weight) < min_block_weight + # 下一个block的第一个字符是数字 + and not span_start_with_num + # 下一个block的第一个字符是大写字母 + and not span_start_with_big_char + ): + if block1['page_num'] != block2['page_num']: + for line in block1['lines']: + for span in line['spans']: + span[SplitFlag.CROSS_PAGE] = True + block2['lines'].extend(block1['lines']) + block1['lines'] = [] + block1[SplitFlag.LINES_DELETED] = True + + return block1, block2 + + +def __merge_2_list_blocks(block1, block2): + if block1['page_num'] != block2['page_num']: + for line in block1['lines']: + for span in line['spans']: + span[SplitFlag.CROSS_PAGE] = True + block2['lines'].extend(block1['lines']) + block1['lines'] = [] + block1[SplitFlag.LINES_DELETED] = True + + return block1, block2 + + +def __is_list_group(text_blocks_group): 
+ # list group的特征是一个group内的所有block都满足以下条件 + # 1.每个block都不超过3行 2. 每个block 的左边界都比较接近(逻辑简单点先不加这个规则) + for block in text_blocks_group: + if len(block['lines']) > 3: + return False + return True + + +def __para_merge_page(blocks): + page_text_blocks_groups = __process_blocks(blocks) + for text_blocks_group in page_text_blocks_groups: + if len(text_blocks_group) > 0: + # 需要先在合并前对所有block判断是否为list or index block + for block in text_blocks_group: + block_type = __is_list_or_index_block(block) + block['type'] = block_type + # logger.info(f"{block['type']}:{block}") + + if len(text_blocks_group) > 1: + # 在合并前判断这个group 是否是一个 list group + is_list_group = __is_list_group(text_blocks_group) + + # 倒序遍历 + for i in range(len(text_blocks_group) - 1, -1, -1): + current_block = text_blocks_group[i] + + # 检查是否有前一个块 + if i - 1 >= 0: + prev_block = text_blocks_group[i - 1] + + if ( + current_block['type'] == 'text' + and prev_block['type'] == 'text' + and not is_list_group + ): + __merge_2_text_blocks(current_block, prev_block) + elif ( + current_block['type'] == BlockType.LIST + and prev_block['type'] == BlockType.LIST + ) or ( + current_block['type'] == BlockType.INDEX + and prev_block['type'] == BlockType.INDEX + ): + __merge_2_list_blocks(current_block, prev_block) + + else: + continue + + +def para_split(page_info_list): + all_blocks = [] + for page_info in page_info_list: + blocks = copy.deepcopy(page_info['preproc_blocks']) + for block in blocks: + block['page_num'] = page_info['page_idx'] + block['page_size'] = page_info['page_size'] + all_blocks.extend(blocks) + + __para_merge_page(all_blocks) + for page_info in page_info_list: + page_info['para_blocks'] = [] + for block in all_blocks: + if 'page_num' in block: + if block['page_num'] == page_info['page_idx']: + page_info['para_blocks'].append(block) + # 从block中删除不需要的page_num和page_size字段 + del block['page_num'] + del block['page_size'] + + +if __name__ == '__main__': + input_blocks = [] + # 调用函数 + groups = 
__process_blocks(input_blocks) + for group_index, group in enumerate(groups): + print(f'Group {group_index}: {group}') diff --git a/vendor/mineru/mineru/backend/pipeline/pipeline_analyze.py b/vendor/mineru/mineru/backend/pipeline/pipeline_analyze.py new file mode 100644 index 0000000000000000000000000000000000000000..43ba355789ac91039a47e36b1f5986af96210e2b --- /dev/null +++ b/vendor/mineru/mineru/backend/pipeline/pipeline_analyze.py @@ -0,0 +1,198 @@ +import os +import time +from typing import List, Tuple +import PIL.Image +from loguru import logger + +from .model_init import MineruPipelineModel +from mineru.utils.config_reader import get_device +from ...utils.pdf_classify import classify +from ...utils.pdf_image_tools import load_images_from_pdf +from ...utils.model_utils import get_vram, clean_memory + + +os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 让mps可以fallback +os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # 禁止albumentations检查更新 + +class ModelSingleton: + _instance = None + _models = {} + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def get_model( + self, + lang=None, + formula_enable=None, + table_enable=None, + ): + key = (lang, formula_enable, table_enable) + if key not in self._models: + self._models[key] = custom_model_init( + lang=lang, + formula_enable=formula_enable, + table_enable=table_enable, + ) + return self._models[key] + + +def custom_model_init( + lang=None, + formula_enable=True, + table_enable=True, +): + model_init_start = time.time() + # 从配置文件读取model-dir和device + device = get_device() + + formula_config = {"enable": formula_enable} + table_config = {"enable": table_enable} + + model_input = { + 'device': device, + 'table_config': table_config, + 'formula_config': formula_config, + 'lang': lang, + } + + custom_model = MineruPipelineModel(**model_input) + + model_init_cost = time.time() - model_init_start + logger.info(f'model init cost: 
{model_init_cost}') + + return custom_model + + +def doc_analyze( + pdf_bytes_list, + lang_list, + parse_method: str = 'auto', + formula_enable=True, + table_enable=True, +): + """ + 适当调大MIN_BATCH_INFERENCE_SIZE可以提高性能,可能会增加显存使用量, + 可通过环境变量MINERU_MIN_BATCH_INFERENCE_SIZE设置,默认值为128。 + """ + min_batch_inference_size = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 128)) + + # 收集所有页面信息 + all_pages_info = [] # 存储(dataset_index, page_index, img, ocr, lang, width, height) + + all_image_lists = [] + all_pdf_docs = [] + ocr_enabled_list = [] + for pdf_idx, pdf_bytes in enumerate(pdf_bytes_list): + # 确定OCR设置 + _ocr_enable = False + if parse_method == 'auto': + if classify(pdf_bytes) == 'ocr': + _ocr_enable = True + elif parse_method == 'ocr': + _ocr_enable = True + + ocr_enabled_list.append(_ocr_enable) + _lang = lang_list[pdf_idx] + + # 收集每个数据集中的页面 + images_list, pdf_doc = load_images_from_pdf(pdf_bytes) + all_image_lists.append(images_list) + all_pdf_docs.append(pdf_doc) + for page_idx in range(len(images_list)): + img_dict = images_list[page_idx] + all_pages_info.append(( + pdf_idx, page_idx, + img_dict['img_pil'], _ocr_enable, _lang, + )) + + # 准备批处理 + images_with_extra_info = [(info[2], info[3], info[4]) for info in all_pages_info] + batch_size = min_batch_inference_size + batch_images = [ + images_with_extra_info[i:i + batch_size] + for i in range(0, len(images_with_extra_info), batch_size) + ] + + # 执行批处理 + results = [] + processed_images_count = 0 + for index, batch_image in enumerate(batch_images): + processed_images_count += len(batch_image) + logger.info( + f'Batch {index + 1}/{len(batch_images)}: ' + f'{processed_images_count} pages/{len(images_with_extra_info)} pages' + ) + batch_results = batch_image_analyze(batch_image, formula_enable, table_enable) + results.extend(batch_results) + + # 构建返回结果 + infer_results = [] + + for _ in range(len(pdf_bytes_list)): + infer_results.append([]) + + for i, page_info in enumerate(all_pages_info): + pdf_idx, page_idx, 
pil_img, _, _ = page_info + result = results[i] + + page_info_dict = {'page_no': page_idx, 'width': pil_img.width, 'height': pil_img.height} + page_dict = {'layout_dets': result, 'page_info': page_info_dict} + + infer_results[pdf_idx].append(page_dict) + + return infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list + + +def batch_image_analyze( + images_with_extra_info: List[Tuple[PIL.Image.Image, bool, str]], + formula_enable=True, + table_enable=True): + # os.environ['CUDA_VISIBLE_DEVICES'] = str(idx) + + from .batch_analyze import BatchAnalyze + + model_manager = ModelSingleton() + + batch_ratio = 1 + device = get_device() + + if str(device).startswith('npu'): + try: + import torch_npu + if torch_npu.npu.is_available(): + torch_npu.npu.set_compile_mode(jit_compile=False) + except Exception as e: + raise RuntimeError( + "NPU is selected as device, but torch_npu is not available. " + "Please ensure that the torch_npu package is installed correctly." + ) from e + + if str(device).startswith('npu') or str(device).startswith('cuda'): + vram = get_vram(device) + if vram is not None: + gpu_memory = int(os.getenv('MINERU_VIRTUAL_VRAM_SIZE', round(vram))) + if gpu_memory >= 16: + batch_ratio = 16 + elif gpu_memory >= 12: + batch_ratio = 8 + elif gpu_memory >= 8: + batch_ratio = 4 + elif gpu_memory >= 6: + batch_ratio = 2 + else: + batch_ratio = 1 + logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}') + else: + # Default batch_ratio when VRAM can't be determined + batch_ratio = 1 + logger.info(f'Could not determine GPU memory, using default batch_ratio: {batch_ratio}') + + batch_model = BatchAnalyze(model_manager, batch_ratio, formula_enable, table_enable) + results = batch_model(images_with_extra_info) + + clean_memory(get_device()) + + return results \ No newline at end of file diff --git a/vendor/mineru/mineru/backend/pipeline/pipeline_magic_model.py b/vendor/mineru/mineru/backend/pipeline/pipeline_magic_model.py new file mode 
100644 index 0000000000000000000000000000000000000000..a88618ed6d98d79bfba59ad29c7366eda8c33f36 --- /dev/null +++ b/vendor/mineru/mineru/backend/pipeline/pipeline_magic_model.py @@ -0,0 +1,501 @@ +from mineru.utils.boxbase import bbox_relative_pos, calculate_iou, bbox_distance, is_in, get_minbox_if_overlap_by_ratio +from mineru.utils.enum_class import CategoryId, ContentType + + +class MagicModel: + """每个函数没有得到元素的时候返回空list.""" + def __init__(self, page_model_info: dict, scale: float): + self.__page_model_info = page_model_info + self.__scale = scale + """为所有模型数据添加bbox信息(缩放,poly->bbox)""" + self.__fix_axis() + """删除置信度特别低的模型数据(<0.05),提高质量""" + self.__fix_by_remove_low_confidence() + """删除高iou(>0.9)数据中置信度较低的那个""" + self.__fix_by_remove_high_iou_and_low_confidence() + """将部分tbale_footnote修正为image_footnote""" + self.__fix_footnote() + """处理重叠的image_body和table_body""" + self.__fix_by_remove_overlap_image_table_body() + + def __fix_by_remove_overlap_image_table_body(self): + need_remove_list = [] + layout_dets = self.__page_model_info['layout_dets'] + image_blocks = list(filter( + lambda x: x['category_id'] == CategoryId.ImageBody, layout_dets + )) + table_blocks = list(filter( + lambda x: x['category_id'] == CategoryId.TableBody, layout_dets + )) + + def add_need_remove_block(blocks): + for i in range(len(blocks)): + for j in range(i + 1, len(blocks)): + block1 = blocks[i] + block2 = blocks[j] + overlap_box = get_minbox_if_overlap_by_ratio( + block1['bbox'], block2['bbox'], 0.8 + ) + if overlap_box is not None: + # 判断哪个区块的面积更小,移除较小的区块 + area1 = (block1['bbox'][2] - block1['bbox'][0]) * (block1['bbox'][3] - block1['bbox'][1]) + area2 = (block2['bbox'][2] - block2['bbox'][0]) * (block2['bbox'][3] - block2['bbox'][1]) + + if area1 <= area2: + block_to_remove = block1 + large_block = block2 + else: + block_to_remove = block2 + large_block = block1 + + if block_to_remove not in need_remove_list: + # 扩展大区块的边界框 + x1, y1, x2, y2 = large_block['bbox'] + sx1, sy1, sx2, sy2 = 
block_to_remove['bbox'] + x1 = min(x1, sx1) + y1 = min(y1, sy1) + x2 = max(x2, sx2) + y2 = max(y2, sy2) + large_block['bbox'] = [x1, y1, x2, y2] + need_remove_list.append(block_to_remove) + + # 处理图像-图像重叠 + add_need_remove_block(image_blocks) + # 处理表格-表格重叠 + add_need_remove_block(table_blocks) + + # 从布局中移除标记的区块 + for need_remove in need_remove_list: + if need_remove in layout_dets: + layout_dets.remove(need_remove) + + + def __fix_axis(self): + need_remove_list = [] + layout_dets = self.__page_model_info['layout_dets'] + for layout_det in layout_dets: + x0, y0, _, _, x1, y1, _, _ = layout_det['poly'] + bbox = [ + int(x0 / self.__scale), + int(y0 / self.__scale), + int(x1 / self.__scale), + int(y1 / self.__scale), + ] + layout_det['bbox'] = bbox + # 删除高度或者宽度小于等于0的spans + if bbox[2] - bbox[0] <= 0 or bbox[3] - bbox[1] <= 0: + need_remove_list.append(layout_det) + for need_remove in need_remove_list: + layout_dets.remove(need_remove) + + def __fix_by_remove_low_confidence(self): + need_remove_list = [] + layout_dets = self.__page_model_info['layout_dets'] + for layout_det in layout_dets: + if layout_det['score'] <= 0.05: + need_remove_list.append(layout_det) + else: + continue + for need_remove in need_remove_list: + layout_dets.remove(need_remove) + + def __fix_by_remove_high_iou_and_low_confidence(self): + need_remove_list = [] + layout_dets = list(filter( + lambda x: x['category_id'] in [ + CategoryId.Title, + CategoryId.Text, + CategoryId.ImageBody, + CategoryId.ImageCaption, + CategoryId.TableBody, + CategoryId.TableCaption, + CategoryId.TableFootnote, + CategoryId.InterlineEquation_Layout, + CategoryId.InterlineEquationNumber_Layout, + ], self.__page_model_info['layout_dets'] + ) + ) + for i in range(len(layout_dets)): + for j in range(i + 1, len(layout_dets)): + layout_det1 = layout_dets[i] + layout_det2 = layout_dets[j] + + if calculate_iou(layout_det1['bbox'], layout_det2['bbox']) > 0.9: + + layout_det_need_remove = layout_det1 if layout_det1['score'] < 
layout_det2['score'] else layout_det2 + + if layout_det_need_remove not in need_remove_list: + need_remove_list.append(layout_det_need_remove) + + for need_remove in need_remove_list: + self.__page_model_info['layout_dets'].remove(need_remove) + + def __fix_footnote(self): + footnotes = [] + figures = [] + tables = [] + + for obj in self.__page_model_info['layout_dets']: + if obj['category_id'] == CategoryId.TableFootnote: + footnotes.append(obj) + elif obj['category_id'] == CategoryId.ImageBody: + figures.append(obj) + elif obj['category_id'] == CategoryId.TableBody: + tables.append(obj) + if len(footnotes) * len(figures) == 0: + continue + dis_figure_footnote = {} + dis_table_footnote = {} + + for i in range(len(footnotes)): + for j in range(len(figures)): + pos_flag_count = sum( + list( + map( + lambda x: 1 if x else 0, + bbox_relative_pos( + footnotes[i]['bbox'], figures[j]['bbox'] + ), + ) + ) + ) + if pos_flag_count > 1: + continue + dis_figure_footnote[i] = min( + self._bbox_distance(figures[j]['bbox'], footnotes[i]['bbox']), + dis_figure_footnote.get(i, float('inf')), + ) + for i in range(len(footnotes)): + for j in range(len(tables)): + pos_flag_count = sum( + list( + map( + lambda x: 1 if x else 0, + bbox_relative_pos( + footnotes[i]['bbox'], tables[j]['bbox'] + ), + ) + ) + ) + if pos_flag_count > 1: + continue + + dis_table_footnote[i] = min( + self._bbox_distance(tables[j]['bbox'], footnotes[i]['bbox']), + dis_table_footnote.get(i, float('inf')), + ) + for i in range(len(footnotes)): + if i not in dis_figure_footnote: + continue + if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]: + footnotes[i]['category_id'] = CategoryId.ImageFootnote + + def _bbox_distance(self, bbox1, bbox2): + left, right, bottom, top = bbox_relative_pos(bbox1, bbox2) + flags = [left, right, bottom, top] + count = sum([1 if v else 0 for v in flags]) + if count > 1: + return float('inf') + if left or right: + l1 = bbox1[3] - bbox1[1] + l2 = bbox2[3] - bbox2[1] + 
else: + l1 = bbox1[2] - bbox1[0] + l2 = bbox2[2] - bbox2[0] + + if l2 > l1 and (l2 - l1) / l1 > 0.3: + return float('inf') + + return bbox_distance(bbox1, bbox2) + + def __reduct_overlap(self, bboxes): + N = len(bboxes) + keep = [True] * N + for i in range(N): + for j in range(N): + if i == j: + continue + if is_in(bboxes[i]['bbox'], bboxes[j]['bbox']): + keep[i] = False + return [bboxes[i] for i in range(N) if keep[i]] + + def __tie_up_category_by_distance_v3( + self, + subject_category_id: int, + object_category_id: int, + ): + subjects = self.__reduct_overlap( + list( + map( + lambda x: {'bbox': x['bbox'], 'score': x['score']}, + filter( + lambda x: x['category_id'] == subject_category_id, + self.__page_model_info['layout_dets'], + ), + ) + ) + ) + objects = self.__reduct_overlap( + list( + map( + lambda x: {'bbox': x['bbox'], 'score': x['score']}, + filter( + lambda x: x['category_id'] == object_category_id, + self.__page_model_info['layout_dets'], + ), + ) + ) + ) + + ret = [] + N, M = len(subjects), len(objects) + subjects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2) + objects.sort(key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2) + + OBJ_IDX_OFFSET = 10000 + SUB_BIT_KIND, OBJ_BIT_KIND = 0, 1 + + all_boxes_with_idx = [(i, SUB_BIT_KIND, sub['bbox'][0], sub['bbox'][1]) for i, sub in enumerate(subjects)] + [(i + OBJ_IDX_OFFSET , OBJ_BIT_KIND, obj['bbox'][0], obj['bbox'][1]) for i, obj in enumerate(objects)] + seen_idx = set() + seen_sub_idx = set() + + while N > len(seen_sub_idx): + candidates = [] + for idx, kind, x0, y0 in all_boxes_with_idx: + if idx in seen_idx: + continue + candidates.append((idx, kind, x0, y0)) + + if len(candidates) == 0: + break + left_x = min([v[2] for v in candidates]) + top_y = min([v[3] for v in candidates]) + + candidates.sort(key=lambda x: (x[2]-left_x) ** 2 + (x[3] - top_y) ** 2) + + + fst_idx, fst_kind, left_x, top_y = candidates[0] + candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y)**2) + nxt = 
None + + for i in range(1, len(candidates)): + if candidates[i][1] ^ fst_kind == 1: + nxt = candidates[i] + break + if nxt is None: + break + + if fst_kind == SUB_BIT_KIND: + sub_idx, obj_idx = fst_idx, nxt[0] - OBJ_IDX_OFFSET + + else: + sub_idx, obj_idx = nxt[0], fst_idx - OBJ_IDX_OFFSET + + pair_dis = bbox_distance(subjects[sub_idx]['bbox'], objects[obj_idx]['bbox']) + nearest_dis = float('inf') + for i in range(N): + if i in seen_idx or i == sub_idx:continue + nearest_dis = min(nearest_dis, bbox_distance(subjects[i]['bbox'], objects[obj_idx]['bbox'])) + + if pair_dis >= 3*nearest_dis: + seen_idx.add(sub_idx) + continue + + seen_idx.add(sub_idx) + seen_idx.add(obj_idx + OBJ_IDX_OFFSET) + seen_sub_idx.add(sub_idx) + + ret.append( + { + 'sub_bbox': { + 'bbox': subjects[sub_idx]['bbox'], + 'score': subjects[sub_idx]['score'], + }, + 'obj_bboxes': [ + {'score': objects[obj_idx]['score'], 'bbox': objects[obj_idx]['bbox']} + ], + 'sub_idx': sub_idx, + } + ) + + for i in range(len(objects)): + j = i + OBJ_IDX_OFFSET + if j in seen_idx: + continue + seen_idx.add(j) + nearest_dis, nearest_sub_idx = float('inf'), -1 + for k in range(len(subjects)): + dis = bbox_distance(objects[i]['bbox'], subjects[k]['bbox']) + if dis < nearest_dis: + nearest_dis = dis + nearest_sub_idx = k + + for k in range(len(subjects)): + if k != nearest_sub_idx: continue + if k in seen_sub_idx: + for kk in range(len(ret)): + if ret[kk]['sub_idx'] == k: + ret[kk]['obj_bboxes'].append({'score': objects[i]['score'], 'bbox': objects[i]['bbox']}) + break + else: + ret.append( + { + 'sub_bbox': { + 'bbox': subjects[k]['bbox'], + 'score': subjects[k]['score'], + }, + 'obj_bboxes': [ + {'score': objects[i]['score'], 'bbox': objects[i]['bbox']} + ], + 'sub_idx': k, + } + ) + seen_sub_idx.add(k) + seen_idx.add(k) + + + for i in range(len(subjects)): + if i in seen_sub_idx: + continue + ret.append( + { + 'sub_bbox': { + 'bbox': subjects[i]['bbox'], + 'score': subjects[i]['score'], + }, + 'obj_bboxes': [], + 
'sub_idx': i, + } + ) + + + return ret + + def get_imgs(self): + with_captions = self.__tie_up_category_by_distance_v3( + CategoryId.ImageBody, CategoryId.ImageCaption + ) + with_footnotes = self.__tie_up_category_by_distance_v3( + CategoryId.ImageBody, CategoryId.ImageFootnote + ) + ret = [] + for v in with_captions: + record = { + 'image_body': v['sub_bbox'], + 'image_caption_list': v['obj_bboxes'], + } + filter_idx = v['sub_idx'] + d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes)) + record['image_footnote_list'] = d['obj_bboxes'] + ret.append(record) + return ret + + def get_tables(self) -> list: + with_captions = self.__tie_up_category_by_distance_v3( + CategoryId.TableBody, CategoryId.TableCaption + ) + with_footnotes = self.__tie_up_category_by_distance_v3( + CategoryId.TableBody, CategoryId.TableFootnote + ) + ret = [] + for v in with_captions: + record = { + 'table_body': v['sub_bbox'], + 'table_caption_list': v['obj_bboxes'], + } + filter_idx = v['sub_idx'] + d = next(filter(lambda x: x['sub_idx'] == filter_idx, with_footnotes)) + record['table_footnote_list'] = d['obj_bboxes'] + ret.append(record) + return ret + + def get_equations(self) -> tuple[list, list, list]: # 有坐标,也有字 + inline_equations = self.__get_blocks_by_type( + CategoryId.InlineEquation, ['latex'] + ) + interline_equations = self.__get_blocks_by_type( + CategoryId.InterlineEquation_YOLO, ['latex'] + ) + interline_equations_blocks = self.__get_blocks_by_type( + CategoryId.InterlineEquation_Layout + ) + return inline_equations, interline_equations, interline_equations_blocks + + def get_discarded(self) -> list: # 自研模型,只有坐标 + blocks = self.__get_blocks_by_type(CategoryId.Abandon) + return blocks + + def get_text_blocks(self) -> list: # 自研模型搞的,只有坐标,没有字 + blocks = self.__get_blocks_by_type(CategoryId.Text) + return blocks + + def get_title_blocks(self) -> list: # 自研模型,只有坐标,没字 + blocks = self.__get_blocks_by_type(CategoryId.Title) + return blocks + + def get_all_spans(self) -> 
list: + + def remove_duplicate_spans(spans): + new_spans = [] + for span in spans: + if not any(span == existing_span for existing_span in new_spans): + new_spans.append(span) + return new_spans + + all_spans = [] + layout_dets = self.__page_model_info['layout_dets'] + allow_category_id_list = [ + CategoryId.ImageBody, + CategoryId.TableBody, + CategoryId.InlineEquation, + CategoryId.InterlineEquation_YOLO, + CategoryId.OcrText, + ] + """当成span拼接的""" + for layout_det in layout_dets: + category_id = layout_det['category_id'] + if category_id in allow_category_id_list: + span = {'bbox': layout_det['bbox'], 'score': layout_det['score']} + if category_id == CategoryId.ImageBody: + span['type'] = ContentType.IMAGE + elif category_id == CategoryId.TableBody: + # 获取table模型结果 + latex = layout_det.get('latex', None) + html = layout_det.get('html', None) + if latex: + span['latex'] = latex + elif html: + span['html'] = html + span['type'] = ContentType.TABLE + elif category_id == CategoryId.InlineEquation: + span['content'] = layout_det['latex'] + span['type'] = ContentType.INLINE_EQUATION + elif category_id == CategoryId.InterlineEquation_YOLO: + span['content'] = layout_det['latex'] + span['type'] = ContentType.INTERLINE_EQUATION + elif category_id == CategoryId.OcrText: + span['content'] = layout_det['text'] + span['type'] = ContentType.TEXT + all_spans.append(span) + return remove_duplicate_spans(all_spans) + + def __get_blocks_by_type( + self, category_type: int, extra_col=None + ) -> list: + if extra_col is None: + extra_col = [] + blocks = [] + layout_dets = self.__page_model_info.get('layout_dets', []) + for item in layout_dets: + category_id = item.get('category_id', -1) + bbox = item.get('bbox', None) + + if category_id == category_type: + block = { + 'bbox': bbox, + 'score': item.get('score'), + } + for col in extra_col: + block[col] = item.get(col, None) + blocks.append(block) + return blocks \ No newline at end of file diff --git 
a/vendor/mineru/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py b/vendor/mineru/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py new file mode 100644 index 0000000000000000000000000000000000000000..f8f2b61464c2e9316ee2b2dd94869bbd92077221 --- /dev/null +++ b/vendor/mineru/mineru/backend/pipeline/pipeline_middle_json_mkcontent.py @@ -0,0 +1,298 @@ +import re +from loguru import logger + +from mineru.utils.config_reader import get_latex_delimiter_config +from mineru.backend.pipeline.para_split import ListLineTag +from mineru.utils.enum_class import BlockType, ContentType, MakeMode +from mineru.utils.language import detect_lang + + +def __is_hyphen_at_line_end(line): + """Check if a line ends with one or more letters followed by a hyphen. + + Args: + line (str): The line of text to check. + + Returns: + bool: True if the line ends with one or more letters followed by a hyphen, False otherwise. + """ + # Use regex to check if the line ends with one or more letters followed by a hyphen + return bool(re.search(r'[A-Za-z]+-\s*$', line)) + + +def make_blocks_to_markdown(paras_of_layout, + mode, + img_buket_path='', + ): + page_markdown = [] + for para_block in paras_of_layout: + para_text = '' + para_type = para_block['type'] + if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]: + para_text = merge_para_with_text(para_block) + elif para_type == BlockType.TITLE: + title_level = get_title_level(para_block) + para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}' + elif para_type == BlockType.INTERLINE_EQUATION: + if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0: + continue + if para_block['lines'][0]['spans'][0].get('content', ''): + para_text = merge_para_with_text(para_block) + else: + para_text += f"![]({img_buket_path}/{para_block['lines'][0]['spans'][0]['image_path']})" + elif para_type == BlockType.IMAGE: + if mode == MakeMode.NLP_MD: + continue + elif mode == MakeMode.MM_MD: + # 检测是否存在图片脚注 + 
has_image_footnote = any(block['type'] == BlockType.IMAGE_FOOTNOTE for block in para_block['blocks']) + # 如果存在图片脚注,则将图片脚注拼接到图片正文后面 + if has_image_footnote: + for block in para_block['blocks']: # 1st.拼image_caption + if block['type'] == BlockType.IMAGE_CAPTION: + para_text += merge_para_with_text(block) + ' \n' + for block in para_block['blocks']: # 2nd.拼image_body + if block['type'] == BlockType.IMAGE_BODY: + for line in block['lines']: + for span in line['spans']: + if span['type'] == ContentType.IMAGE: + if span.get('image_path', ''): + para_text += f"![]({img_buket_path}/{span['image_path']})" + for block in para_block['blocks']: # 3rd.拼image_footnote + if block['type'] == BlockType.IMAGE_FOOTNOTE: + para_text += ' \n' + merge_para_with_text(block) + else: + for block in para_block['blocks']: # 1st.拼image_body + if block['type'] == BlockType.IMAGE_BODY: + for line in block['lines']: + for span in line['spans']: + if span['type'] == ContentType.IMAGE: + if span.get('image_path', ''): + para_text += f"![]({img_buket_path}/{span['image_path']})" + for block in para_block['blocks']: # 2nd.拼image_caption + if block['type'] == BlockType.IMAGE_CAPTION: + para_text += ' \n' + merge_para_with_text(block) + elif para_type == BlockType.TABLE: + if mode == MakeMode.NLP_MD: + continue + elif mode == MakeMode.MM_MD: + for block in para_block['blocks']: # 1st.拼table_caption + if block['type'] == BlockType.TABLE_CAPTION: + para_text += merge_para_with_text(block) + ' \n' + for block in para_block['blocks']: # 2nd.拼table_body + if block['type'] == BlockType.TABLE_BODY: + for line in block['lines']: + for span in line['spans']: + if span['type'] == ContentType.TABLE: + # if processed by table model + if span.get('html', ''): + para_text += f"\n{span['html']}\n" + elif span.get('image_path', ''): + para_text += f"![]({img_buket_path}/{span['image_path']})" + for block in para_block['blocks']: # 3rd.拼table_footnote + if block['type'] == BlockType.TABLE_FOOTNOTE: + para_text += '\n' 
+ merge_para_with_text(block) + ' ' + + if para_text.strip() == '': + continue + else: + # page_markdown.append(para_text.strip() + ' ') + page_markdown.append(para_text.strip()) + + return page_markdown + + +def full_to_half(text: str) -> str: + """Convert full-width characters to half-width characters using code point manipulation. + + Args: + text: String containing full-width characters + + Returns: + String with full-width characters converted to half-width + """ + result = [] + for char in text: + code = ord(char) + # Full-width letters and numbers (FF21-FF3A for A-Z, FF41-FF5A for a-z, FF10-FF19 for 0-9) + if (0xFF21 <= code <= 0xFF3A) or (0xFF41 <= code <= 0xFF5A) or (0xFF10 <= code <= 0xFF19): + result.append(chr(code - 0xFEE0)) # Shift to ASCII range + else: + result.append(char) + return ''.join(result) + +latex_delimiters_config = get_latex_delimiter_config() + +default_delimiters = { + 'display': {'left': '$$', 'right': '$$'}, + 'inline': {'left': '$', 'right': '$'} +} + +delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters + +display_left_delimiter = delimiters['display']['left'] +display_right_delimiter = delimiters['display']['right'] +inline_left_delimiter = delimiters['inline']['left'] +inline_right_delimiter = delimiters['inline']['right'] + +def merge_para_with_text(para_block): + block_text = '' + for line in para_block['lines']: + for span in line['spans']: + if span['type'] in [ContentType.TEXT]: + span['content'] = full_to_half(span['content']) + block_text += span['content'] + block_lang = detect_lang(block_text) + + para_text = '' + for i, line in enumerate(para_block['lines']): + + if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False): + para_text += ' \n' + + for j, span in enumerate(line['spans']): + + span_type = span['type'] + content = '' + if span_type == ContentType.TEXT: + content = escape_special_markdown_char(span['content']) + elif span_type == ContentType.INLINE_EQUATION: + if 
span.get('content', ''): + content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}" + elif span_type == ContentType.INTERLINE_EQUATION: + if span.get('content', ''): + content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n" + + content = content.strip() + + if content: + langs = ['zh', 'ja', 'ko'] + # logger.info(f'block_lang: {block_lang}, content: {content}') + if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格 + if j == len(line['spans']) - 1 and span_type not in [ContentType.INLINE_EQUATION]: + para_text += content + else: + para_text += f'{content} ' + else: + if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]: + # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除 + if j == len(line['spans'])-1 and span_type == ContentType.TEXT and __is_hyphen_at_line_end(content): + para_text += content[:-1] + else: # 西方文本语境下 content间需要空格分隔 + para_text += f'{content} ' + elif span_type == ContentType.INTERLINE_EQUATION: + para_text += content + else: + continue + + return para_text + + +def make_blocks_to_content_list(para_block, img_buket_path, page_idx): + para_type = para_block['type'] + para_content = {} + if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]: + para_content = { + 'type': ContentType.TEXT, + 'text': merge_para_with_text(para_block), + } + elif para_type == BlockType.TITLE: + para_content = { + 'type': ContentType.TEXT, + 'text': merge_para_with_text(para_block), + } + title_level = get_title_level(para_block) + if title_level != 0: + para_content['text_level'] = title_level + elif para_type == BlockType.INTERLINE_EQUATION: + if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0: + return None + para_content = { + 'type': ContentType.EQUATION, + 'img_path': f"{img_buket_path}/{para_block['lines'][0]['spans'][0].get('image_path', '')}", + } + if para_block['lines'][0]['spans'][0].get('content', ''): + para_content['text'] = 
merge_para_with_text(para_block) + para_content['text_format'] = 'latex' + elif para_type == BlockType.IMAGE: + para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []} + for block in para_block['blocks']: + if block['type'] == BlockType.IMAGE_BODY: + for line in block['lines']: + for span in line['spans']: + if span['type'] == ContentType.IMAGE: + if span.get('image_path', ''): + para_content['img_path'] = f"{img_buket_path}/{span['image_path']}" + if block['type'] == BlockType.IMAGE_CAPTION: + para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block)) + if block['type'] == BlockType.IMAGE_FOOTNOTE: + para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block)) + elif para_type == BlockType.TABLE: + para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []} + for block in para_block['blocks']: + if block['type'] == BlockType.TABLE_BODY: + for line in block['lines']: + for span in line['spans']: + if span['type'] == ContentType.TABLE: + if span.get('html', ''): + para_content[BlockType.TABLE_BODY] = f"{span['html']}" + + if span.get('image_path', ''): + para_content['img_path'] = f"{img_buket_path}/{span['image_path']}" + + if block['type'] == BlockType.TABLE_CAPTION: + para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block)) + if block['type'] == BlockType.TABLE_FOOTNOTE: + para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block)) + + para_content['page_idx'] = page_idx + + return para_content + + +def union_make(pdf_info_dict: list, + make_mode: str, + img_buket_path: str = '', + ): + output_content = [] + for page_info in pdf_info_dict: + paras_of_layout = page_info.get('para_blocks') + page_idx = page_info.get('page_idx') + if not paras_of_layout: + continue + if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: + page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, 
img_buket_path) + output_content.extend(page_markdown) + elif make_mode == MakeMode.CONTENT_LIST: + for para_block in paras_of_layout: + para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx) + if para_content: + output_content.append(para_content) + + if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: + return '\n\n'.join(output_content) + elif make_mode == MakeMode.CONTENT_LIST: + return output_content + else: + logger.error(f"Unsupported make mode: {make_mode}") + return None + + +def get_title_level(block): + title_level = block.get('level', 1) + if title_level > 4: + title_level = 4 + elif title_level < 1: + title_level = 0 + return title_level + + +def escape_special_markdown_char(content): + """ + 转义正文里对markdown语法有特殊意义的字符 + """ + special_chars = ["*", "`", "~", "$"] + for char in special_chars: + content = content.replace(char, "\\" + char) + + return content \ No newline at end of file diff --git a/vendor/mineru/mineru/backend/vlm/__init__.py b/vendor/mineru/mineru/backend/vlm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/vendor/mineru/mineru/backend/vlm/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/vendor/mineru/mineru/backend/vlm/base_predictor.py b/vendor/mineru/mineru/backend/vlm/base_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..f65b4a482cc8a84154c50e8d5acc2a22a91a7df4 --- /dev/null +++ b/vendor/mineru/mineru/backend/vlm/base_predictor.py @@ -0,0 +1,186 @@ +import asyncio +from abc import ABC, abstractmethod +from typing import AsyncIterable, Iterable, List, Optional, Union + +DEFAULT_SYSTEM_PROMPT = ( + "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers." 
+) +DEFAULT_USER_PROMPT = "Document Parsing:" +DEFAULT_TEMPERATURE = 0.0 +DEFAULT_TOP_P = 0.8 +DEFAULT_TOP_K = 20 +DEFAULT_REPETITION_PENALTY = 1.0 +DEFAULT_PRESENCE_PENALTY = 0.0 +DEFAULT_NO_REPEAT_NGRAM_SIZE = 100 +DEFAULT_MAX_NEW_TOKENS = 16384 + + +class BasePredictor(ABC): + system_prompt = DEFAULT_SYSTEM_PROMPT + + def __init__( + self, + temperature: float = DEFAULT_TEMPERATURE, + top_p: float = DEFAULT_TOP_P, + top_k: int = DEFAULT_TOP_K, + repetition_penalty: float = DEFAULT_REPETITION_PENALTY, + presence_penalty: float = DEFAULT_PRESENCE_PENALTY, + no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE, + max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, + ) -> None: + self.temperature = temperature + self.top_p = top_p + self.top_k = top_k + self.repetition_penalty = repetition_penalty + self.presence_penalty = presence_penalty + self.no_repeat_ngram_size = no_repeat_ngram_size + self.max_new_tokens = max_new_tokens + + @abstractmethod + def predict( + self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> str: ... + + @abstractmethod + def batch_predict( + self, + images: List[str] | List[bytes], + prompts: Union[List[str], str] = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> List[str]: ... 
+ + @abstractmethod + def stream_predict( + self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> Iterable[str]: ... + + async def aio_predict( + self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> str: + return await asyncio.to_thread( + self.predict, + image, + prompt, + temperature, + top_p, + top_k, + repetition_penalty, + presence_penalty, + no_repeat_ngram_size, + max_new_tokens, + ) + + async def aio_batch_predict( + self, + images: List[str] | List[bytes], + prompts: Union[List[str], str] = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> List[str]: + return await asyncio.to_thread( + self.batch_predict, + images, + prompts, + temperature, + top_p, + top_k, + repetition_penalty, + presence_penalty, + no_repeat_ngram_size, + max_new_tokens, + ) + + async def aio_stream_predict( + self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> AsyncIterable[str]: + queue = asyncio.Queue() + loop = 
asyncio.get_running_loop() + + def synced_predict(): + for chunk in self.stream_predict( + image=image, + prompt=prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + ): + asyncio.run_coroutine_threadsafe(queue.put(chunk), loop) + asyncio.run_coroutine_threadsafe(queue.put(None), loop) + + asyncio.create_task( + asyncio.to_thread(synced_predict), + ) + + while True: + chunk = await queue.get() + if chunk is None: + return + assert isinstance(chunk, str) + yield chunk + + def build_prompt(self, prompt: str) -> str: + if prompt.startswith("<|im_start|>"): + return prompt + if not prompt: + prompt = DEFAULT_USER_PROMPT + + return f"<|im_start|>system\n{self.system_prompt}<|im_end|><|im_start|>user\n\n{prompt}<|im_end|><|im_start|>assistant\n" + # Modify here. We add <|box_start|> at the end of the prompt to force the model to generate bounding box. 
+ # if "Document OCR" in prompt: + # return f"<|im_start|>system\n{self.system_prompt}<|im_end|><|im_start|>user\n\n{prompt}<|im_end|><|im_start|>assistant\n<|box_start|>" + # else: + # return f"<|im_start|>system\n{self.system_prompt}<|im_end|><|im_start|>user\n\n{prompt}<|im_end|><|im_start|>assistant\n" + + def close(self): + pass diff --git a/vendor/mineru/mineru/backend/vlm/hf_predictor.py b/vendor/mineru/mineru/backend/vlm/hf_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..f3cd69d55e549d45a132ec541c54f756f078982e --- /dev/null +++ b/vendor/mineru/mineru/backend/vlm/hf_predictor.py @@ -0,0 +1,211 @@ +from io import BytesIO +from typing import Iterable, List, Optional, Union + +import torch +from PIL import Image +from tqdm import tqdm +from transformers import AutoTokenizer, BitsAndBytesConfig + +from ...model.vlm_hf_model import Mineru2QwenForCausalLM +from ...model.vlm_hf_model.image_processing_mineru2 import process_images +from .base_predictor import ( + DEFAULT_MAX_NEW_TOKENS, + DEFAULT_NO_REPEAT_NGRAM_SIZE, + DEFAULT_PRESENCE_PENALTY, + DEFAULT_REPETITION_PENALTY, + DEFAULT_TEMPERATURE, + DEFAULT_TOP_K, + DEFAULT_TOP_P, + BasePredictor, +) +from .utils import load_resource + + +class HuggingfacePredictor(BasePredictor): + def __init__( + self, + model_path: str, + device_map="auto", + device="cuda", + torch_dtype="auto", + load_in_8bit=False, + load_in_4bit=False, + use_flash_attn=False, + temperature: float = DEFAULT_TEMPERATURE, + top_p: float = DEFAULT_TOP_P, + top_k: int = DEFAULT_TOP_K, + repetition_penalty: float = DEFAULT_REPETITION_PENALTY, + presence_penalty: float = DEFAULT_PRESENCE_PENALTY, + no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE, + max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, + **kwargs, + ): + super().__init__( + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, 
+ max_new_tokens=max_new_tokens, + ) + + kwargs = {"device_map": device_map, **kwargs} + + if device != "cuda": + kwargs["device_map"] = {"": device} + + if load_in_8bit: + kwargs["load_in_8bit"] = True + elif load_in_4bit: + kwargs["load_in_4bit"] = True + kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + else: + kwargs["torch_dtype"] = torch_dtype + + if use_flash_attn: + kwargs["attn_implementation"] = "flash_attention_2" + + self.tokenizer = AutoTokenizer.from_pretrained(model_path) + self.model = Mineru2QwenForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=True, + **kwargs, + ) + setattr(self.model.config, "_name_or_path", model_path) + self.model.eval() + + vision_tower = self.model.get_model().vision_tower + if device_map != "auto": + vision_tower.to(device=device_map, dtype=self.model.dtype) + + self.image_processor = vision_tower.image_processor + self.eos_token_id = self.model.config.eos_token_id + + def predict( + self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + **kwargs, + ) -> str: + prompt = self.build_prompt(prompt) + + if temperature is None: + temperature = self.temperature + if top_p is None: + top_p = self.top_p + if top_k is None: + top_k = self.top_k + if repetition_penalty is None: + repetition_penalty = self.repetition_penalty + if no_repeat_ngram_size is None: + no_repeat_ngram_size = self.no_repeat_ngram_size + if max_new_tokens is None: + max_new_tokens = self.max_new_tokens + + do_sample = (temperature > 0.0) and (top_k > 1) + + generate_kwargs = { + "repetition_penalty": repetition_penalty, + "no_repeat_ngram_size": 
no_repeat_ngram_size, + "max_new_tokens": max_new_tokens, + "do_sample": do_sample, + } + if do_sample: + generate_kwargs["temperature"] = temperature + generate_kwargs["top_p"] = top_p + generate_kwargs["top_k"] = top_k + + if isinstance(image, str): + image = load_resource(image) + + image_obj = Image.open(BytesIO(image)) + image_tensor = process_images([image_obj], self.image_processor, self.model.config) + image_tensor = image_tensor[0].unsqueeze(0) + image_tensor = image_tensor.to(device=self.model.device, dtype=self.model.dtype) + image_sizes = [[*image_obj.size]] + + input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids + input_ids = input_ids.to(device=self.model.device) + + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids, + images=image_tensor, + image_sizes=image_sizes, + use_cache=True, + **generate_kwargs, + **kwargs, + ) + + # Remove the last token if it is the eos_token_id + if len(output_ids[0]) > 0 and output_ids[0, -1] == self.eos_token_id: + output_ids = output_ids[:, :-1] + + output = self.tokenizer.batch_decode( + output_ids, + skip_special_tokens=False, + )[0].strip() + + return output + + def batch_predict( + self, + images: List[str] | List[bytes], + prompts: Union[List[str], str] = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, # not supported by hf + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + **kwargs, + ) -> List[str]: + if not isinstance(prompts, list): + prompts = [prompts] * len(images) + + assert len(prompts) == len(images), "Length of prompts and images must match." 
+ + outputs = [] + for prompt, image in tqdm(zip(prompts, images), total=len(images), desc="Predict"): + output = self.predict( + image, + prompt, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + **kwargs, + ) + outputs.append(output) + return outputs + + def stream_predict( + self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> Iterable[str]: + raise NotImplementedError("Streaming is not supported yet.") diff --git a/vendor/mineru/mineru/backend/vlm/predictor.py b/vendor/mineru/mineru/backend/vlm/predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..0d96844555ff7451599f9c15e62741643a78bf3c --- /dev/null +++ b/vendor/mineru/mineru/backend/vlm/predictor.py @@ -0,0 +1,111 @@ +# Copyright (c) Opendatalab. All rights reserved. + +import time + +from loguru import logger + +from .base_predictor import ( + DEFAULT_MAX_NEW_TOKENS, + DEFAULT_NO_REPEAT_NGRAM_SIZE, + DEFAULT_PRESENCE_PENALTY, + DEFAULT_REPETITION_PENALTY, + DEFAULT_TEMPERATURE, + DEFAULT_TOP_K, + DEFAULT_TOP_P, + BasePredictor, +) +from .sglang_client_predictor import SglangClientPredictor + +hf_loaded = False +try: + from .hf_predictor import HuggingfacePredictor + + hf_loaded = True +except ImportError as e: + logger.warning("hf is not installed. If you are not using transformers, you can ignore this warning.") + +engine_loaded = False +try: + from sglang.srt.server_args import ServerArgs + + from .sglang_engine_predictor import SglangEnginePredictor + + engine_loaded = True +except Exception as e: + logger.warning("sglang is not installed. 
If you are not using sglang, you can ignore this warning.") + + +def get_predictor( + backend: str = "sglang-client", + model_path: str | None = None, + server_url: str | None = None, + temperature: float = DEFAULT_TEMPERATURE, + top_p: float = DEFAULT_TOP_P, + top_k: int = DEFAULT_TOP_K, + repetition_penalty: float = DEFAULT_REPETITION_PENALTY, + presence_penalty: float = DEFAULT_PRESENCE_PENALTY, + no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE, + max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, + http_timeout: int = 600, + **kwargs, +) -> BasePredictor: + start_time = time.time() + + if backend == "transformers": + if not model_path: + raise ValueError("model_path must be provided for transformers backend.") + if not hf_loaded: + raise ImportError( + "transformers is not installed, so huggingface backend cannot be used. " + "If you need to use huggingface backend, please install transformers first." + ) + predictor = HuggingfacePredictor( + model_path=model_path, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + **kwargs, + ) + elif backend == "sglang-engine": + if not model_path: + raise ValueError("model_path must be provided for sglang-engine backend.") + if not engine_loaded: + raise ImportError( + "sglang is not installed, so sglang-engine backend cannot be used. " + "If you need to use sglang-engine backend for inference, " + "please install sglang[all]==0.4.8 or a newer version." 
+ ) + predictor = SglangEnginePredictor( + server_args=ServerArgs(model_path, **kwargs), + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + ) + elif backend == "sglang-client": + if not server_url: + raise ValueError("server_url must be provided for sglang-client backend.") + predictor = SglangClientPredictor( + server_url=server_url, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + http_timeout=http_timeout, + ) + else: + raise ValueError(f"Unsupported backend: {backend}. Supports: transformers, sglang-engine, sglang-client.") + + elapsed = round(time.time() - start_time, 2) + logger.info(f"get_predictor cost: {elapsed}s") + return predictor diff --git a/vendor/mineru/mineru/backend/vlm/sglang_client_predictor.py b/vendor/mineru/mineru/backend/vlm/sglang_client_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..e886184cd7eea6522623c9215ee04284e95130ea --- /dev/null +++ b/vendor/mineru/mineru/backend/vlm/sglang_client_predictor.py @@ -0,0 +1,443 @@ +import asyncio +import json +import re +from base64 import b64encode +from typing import AsyncIterable, Iterable, List, Optional, Set, Tuple, Union + +import httpx + +from .base_predictor import ( + DEFAULT_MAX_NEW_TOKENS, + DEFAULT_NO_REPEAT_NGRAM_SIZE, + DEFAULT_PRESENCE_PENALTY, + DEFAULT_REPETITION_PENALTY, + DEFAULT_TEMPERATURE, + DEFAULT_TOP_K, + DEFAULT_TOP_P, + BasePredictor, +) +from .utils import aio_load_resource, load_resource + + +class SglangClientPredictor(BasePredictor): + def __init__( + self, + server_url: str, + temperature: float = DEFAULT_TEMPERATURE, + top_p: float = DEFAULT_TOP_P, + top_k: int = DEFAULT_TOP_K, + repetition_penalty: float = 
DEFAULT_REPETITION_PENALTY, + presence_penalty: float = DEFAULT_PRESENCE_PENALTY, + no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE, + max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, + http_timeout: int = 600, + ) -> None: + super().__init__( + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + ) + self.http_timeout = http_timeout + + base_url = self.get_base_url(server_url) + self.check_server_health(base_url) + self.model_path = self.get_model_path(base_url) + self.server_url = f"{base_url}/generate" + + @staticmethod + def get_base_url(server_url: str) -> str: + matched = re.match(r"^(https?://[^/]+)", server_url) + if not matched: + raise ValueError(f"Invalid server URL: {server_url}") + return matched.group(1) + + def check_server_health(self, base_url: str): + try: + response = httpx.get(f"{base_url}/health_generate", timeout=self.http_timeout) + except httpx.ConnectError: + raise RuntimeError(f"Failed to connect to server {base_url}. Please check if the server is running.") + if response.status_code != 200: + raise RuntimeError( + f"Server {base_url} is not healthy. Status code: {response.status_code}, response body: {response.text}" + ) + + def get_model_path(self, base_url: str) -> str: + try: + response = httpx.get(f"{base_url}/get_model_info", timeout=self.http_timeout) + except httpx.ConnectError: + raise RuntimeError(f"Failed to connect to server {base_url}. Please check if the server is running.") + if response.status_code != 200: + raise RuntimeError( + f"Failed to get model info from {base_url}. 
def build_sampling_params(
    self,
    temperature: Optional[float],
    top_p: Optional[float],
    top_k: Optional[int],
    repetition_penalty: Optional[float],
    presence_penalty: Optional[float],
    no_repeat_ngram_size: Optional[int],
    max_new_tokens: Optional[int],
) -> dict:
    """Build the ``sampling_params`` payload for one generation request.

    Every argument that is ``None`` is replaced by the predictor-level
    default stored on ``self`` under the same name; any other value
    (including ``0``) is passed through unchanged.

    Returns:
        A dict of sampling settings; ``no_repeat_ngram_size`` is nested
        under ``"custom_params"`` and ``"skip_special_tokens"`` is always
        ``False``.
    """

    def or_default(value, attr_name):
        # Look the default up lazily so it is only read when actually needed.
        return getattr(self, attr_name) if value is None else value

    resolved_temperature = or_default(temperature, "temperature")
    resolved_top_p = or_default(top_p, "top_p")
    resolved_top_k = or_default(top_k, "top_k")
    resolved_repetition = or_default(repetition_penalty, "repetition_penalty")
    resolved_presence = or_default(presence_penalty, "presence_penalty")
    resolved_ngram = or_default(no_repeat_ngram_size, "no_repeat_ngram_size")
    resolved_max_tokens = or_default(max_new_tokens, "max_new_tokens")

    # see SamplingParams for more details
    params = {
        "temperature": resolved_temperature,
        "top_p": resolved_top_p,
        "top_k": resolved_top_k,
        "repetition_penalty": resolved_repetition,
        "presence_penalty": resolved_presence,
    }
    params["custom_params"] = {"no_repeat_ngram_size": resolved_ngram}
    params["max_new_tokens"] = resolved_max_tokens
    params["skip_special_tokens"] = False
    return params
repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + ) + + if isinstance(image, str): + image = load_resource(image) + + request_body = self.build_request_body(image, prompt, sampling_params) + response = httpx.post(self.server_url, json=request_body, timeout=self.http_timeout) + response_body = response.json() + return response_body["text"] + + def batch_predict( + self, + images: List[str] | List[bytes], + prompts: Union[List[str], str] = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + max_concurrency: int = 100, + ) -> List[str]: + try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = None + + task = self.aio_batch_predict( + images=images, + prompts=prompts, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + max_concurrency=max_concurrency, + ) + + if loop is not None: + return loop.run_until_complete(task) + else: + return asyncio.run(task) + + def stream_predict( + self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> Iterable[str]: + prompt = self.build_prompt(prompt) + + sampling_params = self.build_sampling_params( + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + 
async def aio_predict(
    self,
    image: str | bytes,
    prompt: str = "",
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    top_k: Optional[int] = None,
    repetition_penalty: Optional[float] = None,
    presence_penalty: Optional[float] = None,
    no_repeat_ngram_size: Optional[int] = None,
    max_new_tokens: Optional[int] = None,
    async_client: Optional[httpx.AsyncClient] = None,
) -> str:
    """Send one image/prompt pair to the server and return the generated text.

    Args:
        image: Raw image bytes, or a string that ``aio_load_resource`` can
            turn into bytes.
        prompt: Prompt text; passed through ``self.build_prompt`` first.
        temperature, top_p, top_k, repetition_penalty, presence_penalty,
        no_repeat_ngram_size, max_new_tokens: Per-call sampling overrides;
            ``None`` falls back to the predictor default.
        async_client: Existing client to reuse. When omitted, a temporary
            client with ``self.http_timeout`` is created for this call.

    Returns:
        The ``"text"`` field of the server's JSON response.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    prompt = self.build_prompt(prompt)

    sampling_params = self.build_sampling_params(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repetition_penalty=repetition_penalty,
        presence_penalty=presence_penalty,
        no_repeat_ngram_size=no_repeat_ngram_size,
        max_new_tokens=max_new_tokens,
    )

    if isinstance(image, str):
        image = await aio_load_resource(image)

    request_body = self.build_request_body(image, prompt, sampling_params)

    async def post_and_extract(client: httpx.AsyncClient) -> str:
        response = await client.post(self.server_url, json=request_body)
        # Report server-side failures as HTTP errors instead of letting them
        # show up later as a confusing KeyError / JSON decoding error.
        response.raise_for_status()
        return response.json()["text"]

    if async_client is not None:
        return await post_and_extract(async_client)

    async with httpx.AsyncClient(timeout=self.http_timeout) as client:
        return await post_and_extract(client)
async def aio_batch_predict_as_iter(
    self,
    images: List[str] | List[bytes],
    prompts: Union[List[str], str] = "",
    temperature: Optional[float] = None,
    top_p: Optional[float] = None,
    top_k: Optional[int] = None,
    repetition_penalty: Optional[float] = None,
    presence_penalty: Optional[float] = None,
    no_repeat_ngram_size: Optional[int] = None,
    max_new_tokens: Optional[int] = None,
    max_concurrency: int = 100,
) -> AsyncIterable[Tuple[int, str]]:
    """Predict a batch concurrently, yielding results as they complete.

    Args:
        images: Images to process.
        prompts: One prompt per image, or a single prompt used for all.
        temperature, top_p, top_k, repetition_penalty, presence_penalty,
        no_repeat_ngram_size, max_new_tokens: Per-call sampling overrides.
        max_concurrency: Maximum number of predictions in flight at once.

    Yields:
        ``(index, text)`` tuples in completion order, where ``index`` is the
        position of the image in ``images``.

    Raises:
        AssertionError: If ``prompts`` is a list whose length differs from
            ``images``.
    """
    if not isinstance(prompts, list):
        prompts = [prompts] * len(images)

    assert len(prompts) == len(images), "Length of prompts and images must match."

    semaphore = asyncio.Semaphore(max_concurrency)

    async def predict_with_semaphore(
        idx: int,
        image: str | bytes,
        prompt: str,
        async_client: httpx.AsyncClient,
    ):
        async with semaphore:
            output = await self.aio_predict(
                image=image,
                prompt=prompt,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                repetition_penalty=repetition_penalty,
                presence_penalty=presence_penalty,
                no_repeat_ngram_size=no_repeat_ngram_size,
                max_new_tokens=max_new_tokens,
                async_client=async_client,
            )
            return (idx, output)

    async with httpx.AsyncClient(timeout=self.http_timeout) as client:
        pending: Set[asyncio.Task[Tuple[int, str]]] = set()

        for idx, (prompt, image) in enumerate(zip(prompts, images)):
            pending.add(
                asyncio.create_task(
                    predict_with_semaphore(idx, image, prompt, client),
                )
            )

        try:
            while pending:
                done, pending = await asyncio.wait(
                    pending,
                    return_when=asyncio.FIRST_COMPLETED,
                )
                for task in done:
                    yield task.result()
        finally:
            # Reached with work still pending when the consumer stops
            # iterating early or a task raised. Cancel and await the
            # leftovers before the shared client is closed, so they do not
            # keep running against a closed client or end up as
            # "exception was never retrieved" warnings.
            for task in pending:
                task.cancel()
            if pending:
                await asyncio.gather(*pending, return_exceptions=True)
json=request_body, + ) as response: + pos = 0 + async for chunk in response.aiter_lines(): + if not (chunk or "").startswith("data:"): + continue + if chunk == "data: [DONE]": + break + data = json.loads(chunk[5:].strip("\n")) + chunk_text = data["text"][pos:] + # meta_info = data["meta_info"] + pos += len(chunk_text) + yield chunk_text diff --git a/vendor/mineru/mineru/backend/vlm/sglang_engine_predictor.py b/vendor/mineru/mineru/backend/vlm/sglang_engine_predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..0a167c633ae47b1533810c22e76c5c63736ecb96 --- /dev/null +++ b/vendor/mineru/mineru/backend/vlm/sglang_engine_predictor.py @@ -0,0 +1,246 @@ +from base64 import b64encode +from typing import AsyncIterable, Iterable, List, Optional, Union + +from sglang.srt.server_args import ServerArgs + +from ...model.vlm_sglang_model.engine import BatchEngine +from .base_predictor import ( + DEFAULT_MAX_NEW_TOKENS, + DEFAULT_NO_REPEAT_NGRAM_SIZE, + DEFAULT_PRESENCE_PENALTY, + DEFAULT_REPETITION_PENALTY, + DEFAULT_TEMPERATURE, + DEFAULT_TOP_K, + DEFAULT_TOP_P, + BasePredictor, +) + + +class SglangEnginePredictor(BasePredictor): + def __init__( + self, + server_args: ServerArgs, + temperature: float = DEFAULT_TEMPERATURE, + top_p: float = DEFAULT_TOP_P, + top_k: int = DEFAULT_TOP_K, + repetition_penalty: float = DEFAULT_REPETITION_PENALTY, + presence_penalty: float = DEFAULT_PRESENCE_PENALTY, + no_repeat_ngram_size: int = DEFAULT_NO_REPEAT_NGRAM_SIZE, + max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS, + ) -> None: + super().__init__( + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + ) + self.engine = BatchEngine(server_args=server_args) + + def load_image_string(self, image: str | bytes) -> str: + if not isinstance(image, (str, bytes)): + raise ValueError("Image must be a string or bytes.") + 
if isinstance(image, bytes): + return b64encode(image).decode("utf-8") + if image.startswith("file://"): + return image[len("file://") :] + return image + + def predict( + self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> str: + return self.batch_predict( + [image], # type: ignore + [prompt], + temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + )[0] + + def batch_predict( + self, + images: List[str] | List[bytes], + prompts: Union[List[str], str] = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> List[str]: + + if not isinstance(prompts, list): + prompts = [prompts] * len(images) + + assert len(prompts) == len(images), "Length of prompts and images must match." 
+ prompts = [self.build_prompt(prompt) for prompt in prompts] + + if temperature is None: + temperature = self.temperature + if top_p is None: + top_p = self.top_p + if top_k is None: + top_k = self.top_k + if repetition_penalty is None: + repetition_penalty = self.repetition_penalty + if presence_penalty is None: + presence_penalty = self.presence_penalty + if no_repeat_ngram_size is None: + no_repeat_ngram_size = self.no_repeat_ngram_size + if max_new_tokens is None: + max_new_tokens = self.max_new_tokens + + # see SamplingParams for more details + sampling_params = { + "temperature": temperature, + "top_p": top_p, + "top_k": top_k, + "repetition_penalty": repetition_penalty, + "presence_penalty": presence_penalty, + "custom_params": { + "no_repeat_ngram_size": no_repeat_ngram_size, + }, + "max_new_tokens": max_new_tokens, + "skip_special_tokens": False, + } + + image_strings = [self.load_image_string(img) for img in images] + + output = self.engine.generate( + prompt=prompts, + image_data=image_strings, + sampling_params=sampling_params, + ) + return [item["text"] for item in output] + + def stream_predict( + self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> Iterable[str]: + raise NotImplementedError("Streaming is not supported yet.") + + async def aio_predict( + self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> str: + output = await self.aio_batch_predict( + [image], # type: ignore + [prompt], + 
temperature=temperature, + top_p=top_p, + top_k=top_k, + repetition_penalty=repetition_penalty, + presence_penalty=presence_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + max_new_tokens=max_new_tokens, + ) + return output[0] + + async def aio_batch_predict( + self, + images: List[str] | List[bytes], + prompts: Union[List[str], str] = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> List[str]: + + if not isinstance(prompts, list): + prompts = [prompts] * len(images) + + assert len(prompts) == len(images), "Length of prompts and images must match." + prompts = [self.build_prompt(prompt) for prompt in prompts] + + if temperature is None: + temperature = self.temperature + if top_p is None: + top_p = self.top_p + if top_k is None: + top_k = self.top_k + if repetition_penalty is None: + repetition_penalty = self.repetition_penalty + if presence_penalty is None: + presence_penalty = self.presence_penalty + if no_repeat_ngram_size is None: + no_repeat_ngram_size = self.no_repeat_ngram_size + if max_new_tokens is None: + max_new_tokens = self.max_new_tokens + + # see SamplingParams for more details + sampling_params = { + "temperature": temperature, + "top_p": top_p, + "top_k": top_k, + "repetition_penalty": repetition_penalty, + "presence_penalty": presence_penalty, + "custom_params": { + "no_repeat_ngram_size": no_repeat_ngram_size, + }, + "max_new_tokens": max_new_tokens, + "skip_special_tokens": False, + } + + image_strings = [self.load_image_string(img) for img in images] + + output = await self.engine.async_generate( + prompt=prompts, + image_data=image_strings, + sampling_params=sampling_params, + ) + ret = [] + for item in output: # type: ignore + ret.append(item["text"]) + return ret + + async def aio_stream_predict( + 
self, + image: str | bytes, + prompt: str = "", + temperature: Optional[float] = None, + top_p: Optional[float] = None, + top_k: Optional[int] = None, + repetition_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + max_new_tokens: Optional[int] = None, + ) -> AsyncIterable[str]: + raise NotImplementedError("Streaming is not supported yet.") + + def close(self): + self.engine.shutdown() diff --git a/vendor/mineru/mineru/backend/vlm/token_to_middle_json.py b/vendor/mineru/mineru/backend/vlm/token_to_middle_json.py new file mode 100644 index 0000000000000000000000000000000000000000..07878ef0aa5df61131335a6c24a89e4767c0ef0c --- /dev/null +++ b/vendor/mineru/mineru/backend/vlm/token_to_middle_json.py @@ -0,0 +1,113 @@ +import time +import cv2 +import numpy as np +from loguru import logger + +from mineru.backend.pipeline.model_init import AtomModelSingleton +from mineru.utils.config_reader import get_llm_aided_config +from mineru.utils.cut_image import cut_image_and_table +from mineru.utils.enum_class import ContentType +from mineru.utils.hash_utils import str_md5 +from mineru.backend.vlm.vlm_magic_model import MagicModel +from mineru.utils.llm_aided import llm_aided_title +from mineru.utils.pdf_image_tools import get_crop_img +from mineru.version import __version__ + + +def token_to_page_info(token, image_dict, page, image_writer, page_index) -> dict: + """将token转换为页面信息""" + # 解析token,提取坐标和类型 + # 假设token格式为:<|box_start|>x0 y0 x1 y1<|box_end|><|ref_start|>type<|ref_end|><|md_start|>content<|md_end|> + # 这里需要根据实际的token格式进行解析 + # 提取所有完整块,每个块从<|box_start|>开始到<|md_end|>或<|im_end|>结束 + + scale = image_dict["scale"] + page_pil_img = image_dict["img_pil"] + page_img_md5 = str_md5(image_dict["img_base64"]) + width, height = map(int, page.get_size()) + + magic_model = MagicModel(token, width, height) + image_blocks = magic_model.get_image_blocks() + table_blocks = magic_model.get_table_blocks() + 
def result_to_middle_json(token_list, images_list, pdf_doc, image_writer):
    """Convert per-page model output into the "middle JSON" document dict.

    For every entry of ``token_list`` the page with the same index is taken
    from ``pdf_doc`` and ``images_list`` and converted with
    ``token_to_page_info``. If LLM-aided title optimisation is enabled in
    the configuration, it is applied to the collected pages afterwards.

    ``pdf_doc`` is always closed before this function returns or raises.

    Args:
        token_list: Model output, one item per page.
        images_list: Per-page image dicts, indexed like ``token_list``.
        pdf_doc: Open PDF document; indexable by page number and closable.
        image_writer: Writer forwarded to ``token_to_page_info``.

    Returns:
        A dict with the keys ``"pdf_info"`` (list of page infos),
        ``"_backend"`` and ``"_version_name"``.
    """
    middle_json = {"pdf_info": [], "_backend":"vlm", "_version_name": __version__}
    try:
        for index, token in enumerate(token_list):
            page = pdf_doc[index]
            image_dict = images_list[index]
            page_info = token_to_page_info(token, image_dict, page, image_writer, index)
            middle_json["pdf_info"].append(page_info)

        # LLM-aided post-processing
        llm_aided_config = get_llm_aided_config()

        if llm_aided_config is not None:
            # Title optimisation
            title_aided_config = llm_aided_config.get('title_aided', None)
            if title_aided_config is not None and title_aided_config.get('enable', False):
                llm_aided_title_start_time = time.time()
                llm_aided_title(middle_json["pdf_info"], title_aided_config)
                logger.info(f'llm aided title time: {round(time.time() - llm_aided_title_start_time, 2)}')
    finally:
        # Close the PDF document even when a page fails to convert, so the
        # handle is not leaked on the error path.
        pdf_doc.close()
    return middle_json
(2) is replaced by \(N_{\mathrm{zero}}\) and \(b\) and \(S\) are constrained to negative as \(N_{\mathrm{zero}}\) decreases as rainfall increases, and increases with plantation growth:<|md_end|>\n<|box_start|>076 813 368 853<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nN_{\mathrm{zero}}=a+b(\Delta P)+\frac{Y}{1+\exp\left(\frac{T-T_{\mathrm{half}}}{S}\right)}\n\]<|md_end|>\n<|box_start|>079 865 482 895<|box_end|><|ref_start|>text<|ref_end|><|md_start|>For the average pre-treatment condition \(\Delta P=0\) and \(T=0\), \(N_{\mathrm{zero}}\) approximately equals \(a\). \(Y\) gives<|md_end|>\n<|box_start|>525 119 926 215<|box_end|><|ref_start|>text<|ref_end|><|md_start|>the magnitude of change in zero flow days due to afforestation, and \(S\) describes the shape of the response. For the average climate condition \(\Delta P=0\), \(a+Y\) becomes the number of zero flow days when the new equilibrium condition under afforestation is reached.<|md_end|>\n<|box_start|>525 240 704 253<|box_end|><|ref_start|>title<|ref_end|><|md_start|># 2.3. Statistical analyses<|md_end|>\n<|box_start|>525 271 926 368<|box_end|><|ref_start|>text<|ref_end|><|md_start|>The coefficient of efficiency \((E)\) (Nash and Sutcliffe, 1970; Chiew and McMahon, 1993; Legates and McCabe, 1999) was used as the 'goodness of fit' measure to evaluate the fit between observed and predicted flow deciles (2) and zero flow days (3). \(E\) is given by:<|md_end|>\n<|box_start|>520 375 735 415<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nE=1.0-\frac{\sum_{i=1}^{N}(O_{i}-P_{i})^{2}}{\sum_{i=1}^{N}(O_{i}-\bar{O})^{2}}\n\]<|md_end|>\n<|box_start|>525 424 926 601<|box_end|><|ref_start|>text<|ref_end|><|md_start|>where \(O\) are observed data, \(P\) are predicted values, and \(\bar{O}\) is the mean for the entire period. \(E\) is unity minus the ratio of the mean square error to the variance in the observed data, and ranges from \(-\infty\) to 1.0. 
Higher values indicate greater agreement between observed and predicted data as per the coefficient of determination \((r^{2})\). \(E\) is used in preference to \(r^{2}\) in evaluating hydrologic modelling because it is a measure of the deviation from the 1:1 line. As \(E\) is always \(0.7\) to indicate adequate model fits.<|md_end|>\n<|box_start|>525 603 926 731<|box_end|><|ref_start|>text<|ref_end|><|md_start|>It is important to assess the significance of the model parameters to check the model assumptions that rainfall and forest age are driving changes in the FDC. The model (2) was split into simplified forms, where only the rainfall or time terms were included by setting \(b=0\), as shown in Eq. (5), or \(Y=0\) as shown in Eq. (6). The component models (5) and (6) were then tested against the complete model, (2).<|md_end|>\n<|box_start|>520 739 735 778<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nQ_{\%}=a+\frac{Y}{1+\exp\left(\frac{T-T_{\mathrm{half}}^{\prime}}{S}\right)}\n\]<|md_end|>\n<|box_start|>525 787 553 799<|box_end|><|ref_start|>text<|ref_end|><|md_start|>and<|md_end|>\n<|box_start|>520 807 646 825<|box_end|><|ref_start|>equation<|ref_end|><|md_start|>\[\nQ_{\%}=a+b\Delta P\n\]<|md_end|>\n<|box_start|>525 833 926 895<|box_end|><|ref_start|>text<|ref_end|><|md_start|>For both the flow duration curve analysis and zero flow days analysis, a \(t\)-test was then performed to test whether (5) and (6) were significantly different to (2). 
A critical value of \(t\) exceeding the calculated \(t\)-value<|md_end|><|im_end|>" + + p_info = token_to_page_info(output) + # 将blocks 转换为json文本 + import json + + json_str = json.dumps(p_info, ensure_ascii=False, indent=4) + print(json_str) diff --git a/vendor/mineru/mineru/backend/vlm/utils.py b/vendor/mineru/mineru/backend/vlm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0783d17e03a60437c300b37773a18e26f61bfbd9 --- /dev/null +++ b/vendor/mineru/mineru/backend/vlm/utils.py @@ -0,0 +1,40 @@ +import os +import re +from base64 import b64decode + +import httpx + +_timeout = int(os.getenv("REQUEST_TIMEOUT", "3")) +_file_exts = (".png", ".jpg", ".jpeg", ".webp", ".gif", ".pdf") +_data_uri_regex = re.compile(r"^data:[^;,]+;base64,") + + +def load_resource(uri: str) -> bytes: + if uri.startswith("http://") or uri.startswith("https://"): + response = httpx.get(uri, timeout=_timeout) + return response.content + if uri.startswith("file://"): + with open(uri[len("file://") :], "rb") as file: + return file.read() + if uri.lower().endswith(_file_exts): + with open(uri, "rb") as file: + return file.read() + if re.match(_data_uri_regex, uri): + return b64decode(uri.split(",")[1]) + return b64decode(uri) + + +async def aio_load_resource(uri: str) -> bytes: + if uri.startswith("http://") or uri.startswith("https://"): + async with httpx.AsyncClient(timeout=_timeout) as client: + response = await client.get(uri) + return response.content + if uri.startswith("file://"): + with open(uri[len("file://") :], "rb") as file: + return file.read() + if uri.lower().endswith(_file_exts): + with open(uri, "rb") as file: + return file.read() + if re.match(_data_uri_regex, uri): + return b64decode(uri.split(",")[1]) + return b64decode(uri) diff --git a/vendor/mineru/mineru/backend/vlm/vlm_analyze.py b/vendor/mineru/mineru/backend/vlm/vlm_analyze.py new file mode 100644 index 0000000000000000000000000000000000000000..aedfa959b3f443e74fc3a246dc02d77fa531d1ba --- 
class ModelSingleton:
    """Process-wide cache of predictor instances.

    Every instantiation returns the same object. Predictors are cached by
    the ``(backend, model_path, server_url)`` triple exactly as passed by
    the caller, so extra keyword arguments only take effect on the first
    request for a given triple.
    """

    _instance = None
    _models = {}

    def __new__(cls, *args, **kwargs):
        existing = cls._instance
        if existing is not None:
            return existing
        cls._instance = super().__new__(cls)
        return cls._instance

    def get_model(
        self,
        backend: str,
        model_path: str | None,
        server_url: str | None,
        **kwargs,
    ) -> BasePredictor:
        """Return the cached predictor for the given settings, creating it once.

        For the ``transformers`` and ``sglang-engine`` backends an empty
        ``model_path`` is replaced by the path returned from
        ``auto_download_and_get_model_root_path`` before the predictor is
        built.
        """
        cache_key = (backend, model_path, server_url)
        if cache_key in self._models:
            return self._models[cache_key]

        runs_locally = backend in ['transformers', 'sglang-engine']
        if runs_locally and not model_path:
            model_path = auto_download_and_get_model_root_path("/","vlm")

        self._models[cache_key] = get_predictor(
            backend=backend,
            model_path=model_path,
            server_url=server_url,
            **kwargs,
        )
        return self._models[cache_key]
async def aio_doc_analyze(
    pdf_bytes,
    image_writer: DataWriter | None,
    predictor: BasePredictor | None = None,
    backend="transformers",
    model_path: str | None = None,
    server_url: str | None = None,
    **kwargs,
):
    """Analyse a PDF with the VLM backend (async variant).

    Renders the PDF pages to images, runs ``aio_batch_predict`` over their
    base64 form and converts the output with ``result_to_middle_json``.

    Args:
        pdf_bytes: Raw PDF content, passed to ``load_images_from_pdf``.
        image_writer: Writer forwarded to ``result_to_middle_json``.
        predictor: Predictor to use. When ``None``, one is obtained from
            ``ModelSingleton`` using ``backend``, ``model_path``,
            ``server_url`` and ``**kwargs``.
        backend: Backend name used when no predictor is supplied.
        model_path: Model location used when no predictor is supplied.
        server_url: Server address used when no predictor is supplied.

    Returns:
        A ``(middle_json, results)`` tuple, where ``results`` is the raw
        per-page predictor output.
    """
    if predictor is None:
        predictor = ModelSingleton().get_model(backend, model_path, server_url, **kwargs)

    images_list, pdf_doc = load_images_from_pdf(pdf_bytes)
    images_base64_list = [image_dict["img_base64"] for image_dict in images_list]

    try:
        results = await predictor.aio_batch_predict(images=images_base64_list)
    except BaseException:
        # result_to_middle_json() is what normally closes pdf_doc. If
        # inference fails or is cancelled it is never reached, so the
        # document has to be released here.
        pdf_doc.close()
        raise

    middle_json = result_to_middle_json(results, images_list, pdf_doc, image_writer)
    return middle_json, results
# OTSL cell tokens whose presence marks a table emitted in OTSL rather than HTML.
_OTSL_CELL_TOKENS = ("<fcel>", "<ecel>")

# Raw VLM block labels that carry plain text (possibly with inline formulas).
_TEXT_BLOCK_LABELS = (
    "text",
    "title",
    "image_caption",
    "image_footnote",
    "table_caption",
    "table_footnote",
    "list",
    "index",
)

# One layout block: bounding box, reference label, then markdown content.
_BLOCK_PATTERN = (
    r"<\|box_start\|>(.*?)<\|box_end\|><\|ref_start\|>(.*?)<\|ref_end\|><\|md_start\|>(.*?)(?:<\|md_end\|>|<\|im_end\|>)"
)

# \left / \right not followed by a letter, so \lefteqn, \leftarrow and
# \rightarrow are not counted.
_LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
_RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')

# Ordered rewrites applied by latex_fix when \left and \right are unbalanced.
_DELIMITER_REWRITES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # well-formed sizing commands
        (r'\\left\\\{', "{"),    # \left\{
        (r"\\left\|", "|"),      # \left|
        (r"\\left\\\|", "|"),    # \left\|
        (r"\\left\(", "("),      # \left(
        (r"\\left\[", "["),      # \left[
        (r"\\right\\\}", "}"),   # \right\}
        (r"\\right\|", "|"),     # \right|
        (r"\\right\\\|", "|"),   # \right\|
        (r"\\right\)", ")"),     # \right)
        (r"\\right\]", "]"),     # \right]
        (r"\\right\.", ""),      # \right.
        # malformed variants
        (r'\\left\{', "{"),      # \left{
        (r'\\right\}', "}"),     # \right}
        (r'\\left\\\(', "("),    # \left\(
        (r'\\right\\\)', ")"),   # \right\)
        (r'\\left\\\[', "["),    # \left\[
        (r'\\right\\\]', "]"),   # \right\]
    )
)


def _contains_otsl(text):
    """Return True when *text* contains an OTSL cell token."""
    return any(cell_token in text for cell_token in _OTSL_CELL_TOKENS)


def _table_html(block_content):
    """Return table content with OTSL-encoded parts converted to HTML.

    Content without OTSL cell tokens is returned unchanged. Otherwise the
    content is split on blank lines and only the parts containing OTSL tokens
    go through ``convert_otsl_to_html``.
    """
    if not _contains_otsl(block_content):
        return block_content
    converted = []
    for part in block_content.split("\n\n"):
        if _contains_otsl(part):
            part = convert_otsl_to_html(part)
        converted.append(part)
    return "\n\n".join(converted)


def _text_spans(block_content, block_bbox, span_type):
    r"""Build the span(s) for a text-like block.

    When the content has at least one ``\(`` and as many ``\(`` as ``\)``, it
    is split into alternating text / inline-equation spans and a list is
    returned. Whitespace-only text pieces are dropped. Otherwise a single span
    dict of *span_type* is returned.
    """
    opening = block_content.count("\\(")
    if opening != block_content.count("\\)") or opening <= 0:
        return {
            "bbox": block_bbox,
            "type": span_type,
            "content": block_content,
        }

    spans = []
    last_end = 0
    for match in re.finditer(r'\\\((.+?)\\\)', block_content):
        start, end = match.span()

        # Text that precedes this formula.
        if start > last_end:
            text_before = block_content[last_end:start]
            if text_before.strip():
                spans.append({
                    "bbox": block_bbox,
                    "type": ContentType.TEXT,
                    "content": text_before
                })

        # The formula itself, without the \( \) delimiters.
        spans.append({
            "bbox": block_bbox,
            "type": ContentType.INLINE_EQUATION,
            "content": match.group(1).strip()
        })
        last_end = end

    # Text after the last formula.
    if last_end < len(block_content):
        text_after = block_content[last_end:]
        if text_after.strip():
            spans.append({
                "bbox": block_bbox,
                "type": ContentType.TEXT,
                "content": text_after
            })
    return spans


class MagicModel:
    """Parse the raw VLM output of one page into typed layout blocks.

    The token stream is scanned for ``box``/``ref``/``md`` triples. Each triple
    becomes a block holding one line, and the blocks are bucketed into image,
    table, interline-equation, text and title lists. Blocks of any other type
    are parsed (and their spans recorded) but not bucketed.
    """

    def __init__(self, token: str, width, height):
        """Parse *token*.

        Box coordinates in the token are scaled by ``width / 1000`` and
        ``height / 1000`` before being stored.
        """
        self.token = token

        block_infos = re.findall(_BLOCK_PATTERN, token, re.DOTALL)

        blocks = []
        self.all_spans = []
        for index, block_info in enumerate(block_infos):
            block_bbox = block_info[0].strip()
            try:
                x1, y1, x2, y2 = map(int, block_bbox.split())
                x_1, y_1, x_2, y_2 = (
                    int(x1 * width / 1000),
                    int(y1 * height / 1000),
                    int(x2 * width / 1000),
                    int(y2 * height / 1000),
                )
                # Normalise so that (x_1, y_1) is the top-left corner.
                if x_2 < x_1:
                    x_1, x_2 = x_2, x_1
                if y_2 < y_1:
                    y_1, y_2 = y_2, y_1
                block_bbox = (x_1, y_1, x_2, y_2)
                block_type = block_info[1].strip()
                block_content = block_info[2].strip()
            except Exception as e:
                # A malformed block is skipped rather than failing the page.
                logger.warning(f"Invalid block format: {block_info}, error: {e}")
                continue

            span_type = "unknown"
            if block_type in _TEXT_BLOCK_LABELS:
                span_type = ContentType.TEXT
            elif block_type in ["image"]:
                block_type = BlockType.IMAGE_BODY
                span_type = ContentType.IMAGE
            elif block_type in ["table"]:
                block_type = BlockType.TABLE_BODY
                span_type = ContentType.TABLE
            elif block_type in ["equation"]:
                block_type = BlockType.INTERLINE_EQUATION
                span_type = ContentType.INTERLINE_EQUATION

            if span_type in ["image", "table"]:
                span = {
                    "bbox": block_bbox,
                    "type": span_type,
                }
                if span_type == ContentType.TABLE:
                    span["html"] = _table_html(block_content)
            elif span_type in [ContentType.INTERLINE_EQUATION]:
                span = {
                    "bbox": block_bbox,
                    "type": span_type,
                    "content": isolated_formula_clean(block_content),
                }
            else:
                span = _text_spans(block_content, block_bbox, span_type)

            if isinstance(span, dict) and "bbox" in span:
                self.all_spans.append(span)
                line = {
                    "bbox": block_bbox,
                    "spans": [span],
                }
            elif isinstance(span, list):
                self.all_spans.extend(span)
                line = {
                    "bbox": block_bbox,
                    "spans": span,
                }
            else:
                raise ValueError(f"Invalid span type: {span_type}, expected dict or list, got {type(span)}")

            blocks.append(
                {
                    "bbox": block_bbox,
                    "type": block_type,
                    "lines": [line],
                    "index": index,
                }
            )

        self.image_blocks = []
        self.table_blocks = []
        self.interline_equation_blocks = []
        self.text_blocks = []
        self.title_blocks = []
        for block in blocks:
            if block["type"] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]:
                self.image_blocks.append(block)
            elif block["type"] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]:
                self.table_blocks.append(block)
            elif block["type"] == BlockType.INTERLINE_EQUATION:
                self.interline_equation_blocks.append(block)
            elif block["type"] == BlockType.TEXT:
                self.text_blocks.append(block)
            elif block["type"] == BlockType.TITLE:
                self.title_blocks.append(block)
            else:
                continue

    def get_image_blocks(self):
        """Return image bodies grouped with their captions and footnotes."""
        return fix_two_layer_blocks(self.image_blocks, BlockType.IMAGE)

    def get_table_blocks(self):
        """Return table bodies grouped with their captions and footnotes."""
        return fix_two_layer_blocks(self.table_blocks, BlockType.TABLE)

    def get_title_blocks(self):
        """Return title blocks after ``fix_title_blocks`` has processed them."""
        return fix_title_blocks(self.title_blocks)

    def get_text_blocks(self):
        """Return text blocks after ``fix_text_blocks`` has processed them."""
        return fix_text_blocks(self.text_blocks)

    def get_interline_equation_blocks(self):
        """Return the interline-equation blocks as parsed."""
        return self.interline_equation_blocks

    def get_all_spans(self):
        """Return every span created while parsing, in parse order."""
        return self.all_spans


def isolated_formula_clean(txt):
    r"""Strip a leading ``\[`` / trailing ``\]`` and repair ``\left``/``\right``."""
    latex = txt[:]
    if latex.startswith("\\["):
        latex = latex[2:]
    if latex.endswith("\\]"):
        latex = latex[:-2]
    return latex_fix(latex.strip())


def latex_fix(latex):
    r"""Drop ``\left``/``\right`` sizing when the two are unbalanced.

    If the number of ``\left`` commands equals the number of ``\right``
    commands the input is returned untouched. Otherwise every recognised
    ``\left<delim>`` / ``\right<delim>`` (well-formed or malformed, see
    ``_DELIMITER_REWRITES``) is replaced by the bare delimiter and
    ``\right.`` is removed.
    """
    left_count = len(_LEFT_COUNT_PATTERN.findall(latex))
    right_count = len(_RIGHT_COUNT_PATTERN.findall(latex))

    if left_count != right_count:
        # Two passes: a rewrite can expose a new match, e.g. "\left\left(".
        for _ in range(2):
            for pattern, replacement in _DELIMITER_REWRITES:
                latex = pattern.sub(replacement, latex)

    return latex


def __reduct_overlap(bboxes):
    """Drop every entry whose bbox ``is_in`` the bbox of another entry."""
    N = len(bboxes)
    keep = [True] * N
    for i in range(N):
        for j in range(N):
            if i == j:
                continue
            if is_in(bboxes[i]["bbox"], bboxes[j]["bbox"]):
                keep[i] = False
    return [bboxes[i] for i in range(N) if keep[i]]


def __tie_up_category_by_distance_v3(
    blocks: list,
    subject_block_type: str,
    object_block_type: str,
):
    """Attach each object block (caption/footnote) to a subject block (body).

    Returns one record per subject kept after overlap reduction:
    ``{"sub_bbox": <subject>, "obj_bboxes": [<objects>], "sub_idx": <int>}``.
    ``sub_idx`` indexes the subject list after it has been sorted by the
    squared distance of its top-left corner from the origin.
    """
    subjects = __reduct_overlap(
        list(
            map(
                lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
                filter(
                    lambda x: x["type"] == subject_block_type,
                    blocks,
                ),
            )
        )
    )
    objects = __reduct_overlap(
        list(
            map(
                lambda x: {"bbox": x["bbox"], "lines": x["lines"], "index": x["index"]},
                filter(
                    lambda x: x["type"] == object_block_type,
                    blocks,
                ),
            )
        )
    )

    ret = []
    N, M = len(subjects), len(objects)
    subjects.sort(key=lambda x: x["bbox"][0] ** 2 + x["bbox"][1] ** 2)
    objects.sort(key=lambda x: x["bbox"][0] ** 2 + x["bbox"][1] ** 2)

    # Objects share the id space with subjects, shifted by this offset.
    OBJ_IDX_OFFSET = 10000
    SUB_BIT_KIND, OBJ_BIT_KIND = 0, 1

    all_boxes_with_idx = [(i, SUB_BIT_KIND, sub["bbox"][0], sub["bbox"][1]) for i, sub in enumerate(subjects)] + [
        (i + OBJ_IDX_OFFSET, OBJ_BIT_KIND, obj["bbox"][0], obj["bbox"][1]) for i, obj in enumerate(objects)
    ]
    seen_idx = set()
    seen_sub_idx = set()

    # Phase 1: repeatedly take the unseen box closest to the top-left of the
    # remaining boxes and pair it with the nearest unseen box of the other kind.
    while N > len(seen_sub_idx):
        candidates = []
        for idx, kind, x0, y0 in all_boxes_with_idx:
            if idx in seen_idx:
                continue
            candidates.append((idx, kind, x0, y0))

        if len(candidates) == 0:
            break
        left_x = min([v[2] for v in candidates])
        top_y = min([v[3] for v in candidates])

        candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y) ** 2)

        fst_idx, fst_kind, left_x, top_y = candidates[0]
        # Re-sort by distance from the chosen first box.
        candidates.sort(key=lambda x: (x[2] - left_x) ** 2 + (x[3] - top_y) ** 2)
        nxt = None

        for i in range(1, len(candidates)):
            if candidates[i][1] ^ fst_kind == 1:
                nxt = candidates[i]
                break
        if nxt is None:
            break

        if fst_kind == SUB_BIT_KIND:
            sub_idx, obj_idx = fst_idx, nxt[0] - OBJ_IDX_OFFSET

        else:
            sub_idx, obj_idx = nxt[0], fst_idx - OBJ_IDX_OFFSET

        pair_dis = bbox_distance(subjects[sub_idx]["bbox"], objects[obj_idx]["bbox"])
        nearest_dis = float("inf")
        for i in range(N):
            if i in seen_idx or i == sub_idx:
                continue
            nearest_dis = min(nearest_dis, bbox_distance(subjects[i]["bbox"], objects[obj_idx]["bbox"]))

        # Reject the pair when another unseen subject is much closer to the
        # object; the subject is retired, the object stays available.
        if pair_dis >= 3 * nearest_dis:
            seen_idx.add(sub_idx)
            continue

        seen_idx.add(sub_idx)
        seen_idx.add(obj_idx + OBJ_IDX_OFFSET)
        seen_sub_idx.add(sub_idx)

        ret.append(
            {
                "sub_bbox": {
                    "bbox": subjects[sub_idx]["bbox"],
                    "lines": subjects[sub_idx]["lines"],
                    "index": subjects[sub_idx]["index"],
                },
                "obj_bboxes": [
                    {"bbox": objects[obj_idx]["bbox"], "lines": objects[obj_idx]["lines"], "index": objects[obj_idx]["index"]}
                ],
                "sub_idx": sub_idx,
            }
        )

    # Phase 2: every object still unpaired goes to its nearest subject.
    for i in range(len(objects)):
        j = i + OBJ_IDX_OFFSET
        if j in seen_idx:
            continue
        seen_idx.add(j)
        nearest_dis, nearest_sub_idx = float("inf"), -1
        for k in range(len(subjects)):
            dis = bbox_distance(objects[i]["bbox"], subjects[k]["bbox"])
            if dis < nearest_dis:
                nearest_dis = dis
                nearest_sub_idx = k

        for k in range(len(subjects)):
            if k != nearest_sub_idx:
                continue
            if k in seen_sub_idx:
                for kk in range(len(ret)):
                    if ret[kk]["sub_idx"] == k:
                        ret[kk]["obj_bboxes"].append(
                            {"bbox": objects[i]["bbox"], "lines": objects[i]["lines"], "index": objects[i]["index"]}
                        )
                        break
            else:
                ret.append(
                    {
                        "sub_bbox": {
                            "bbox": subjects[k]["bbox"],
                            "lines": subjects[k]["lines"],
                            "index": subjects[k]["index"],
                        },
                        "obj_bboxes": [
                            {"bbox": objects[i]["bbox"], "lines": objects[i]["lines"], "index": objects[i]["index"]}
                        ],
                        "sub_idx": k,
                    }
                )
            seen_sub_idx.add(k)
            seen_idx.add(k)

    # Phase 3: subjects that received no object still get a record.
    for i in range(len(subjects)):
        if i in seen_sub_idx:
            continue
        ret.append(
            {
                "sub_bbox": {
                    "bbox": subjects[i]["bbox"],
                    "lines": subjects[i]["lines"],
                    "index": subjects[i]["index"],
                },
                "obj_bboxes": [],
                "sub_idx": i,
            }
        )

    return ret


def get_type_blocks(blocks, block_type: Literal["image", "table"]):
    """Group every ``<block_type>_body`` with its captions and footnotes.

    Returns a list of dicts with the keys ``<block_type>_body``,
    ``<block_type>_caption_list`` and ``<block_type>_footnote_list``.
    """
    with_captions = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_caption")
    with_footnotes = __tie_up_category_by_distance_v3(blocks, f"{block_type}_body", f"{block_type}_footnote")
    ret = []
    for v in with_captions:
        record = {
            f"{block_type}_body": v["sub_bbox"],
            f"{block_type}_caption_list": v["obj_bboxes"],
        }
        # Both pairings emit a record for every subject, so the lookup succeeds.
        filter_idx = v["sub_idx"]
        d = next(filter(lambda x: x["sub_idx"] == filter_idx, with_footnotes))
        record[f"{block_type}_footnote_list"] = d["obj_bboxes"]
        ret.append(record)
    return ret


def fix_two_layer_blocks(blocks, fix_type: Literal["image", "table"]):
    """Wrap each body with its captions/footnotes into one two-layer block.

    The outer block takes the body's ``bbox`` and ``index``. Its ``blocks``
    list holds the body first, then the captions, then the footnotes, each
    tagged with its ``type``.
    """
    need_fix_blocks = get_type_blocks(blocks, fix_type)
    fixed_blocks = []
    for block in need_fix_blocks:
        body = block[f"{fix_type}_body"]
        caption_list = block[f"{fix_type}_caption_list"]
        footnote_list = block[f"{fix_type}_footnote_list"]

        body["type"] = f"{fix_type}_body"
        for caption in caption_list:
            caption["type"] = f"{fix_type}_caption"
        for footnote in footnote_list:
            footnote["type"] = f"{fix_type}_footnote"

        two_layer_block = {
            "type": fix_type,
            "bbox": body["bbox"],
            "blocks": [
                body,
            ],
            "index": body["index"],
        }
        two_layer_block["blocks"].extend([*caption_list, *footnote_list])

        fixed_blocks.append(two_layer_block)

    return fixed_blocks


def fix_title_blocks(blocks):
    """Turn leading ``#`` markers of title blocks into a ``level`` field.

    The level is the number of leading ``#`` in the merged title text. The
    markers are then stripped from the first span of the first line only.
    Blocks are modified in place and the same list is returned.
    """
    for block in blocks:
        if block["type"] == BlockType.TITLE:
            title_content = merge_para_with_text(block)
            title_level = count_leading_hashes(title_content)
            block['level'] = title_level
            for line in block['lines']:
                for span in line['spans']:
                    span['content'] = strip_leading_hashes(span['content'])
                    break
                break
    return blocks


def count_leading_hashes(text):
    """Return the number of ``#`` characters at the very start of *text*."""
    match = re.match(r'^(#+)', text)
    return len(match.group(1)) if match else 0
def strip_leading_hashes(text):
    """Remove leading ``#`` characters and the whitespace that follows them."""
    return re.sub(r'^#+\s*', '', text)


def fix_text_blocks(blocks):
    """Merge text blocks that the model marked as continued.

    A block whose last span ends with ``<|txt_contd|>`` has that marker
    removed. It then absorbs the lines of the next block that has not already
    been emptied; the donor is left with no lines and flagged with
    ``SplitFlag.LINES_DELETED``. The enlarged block is checked again, so
    chains of continuations collapse into one block. Blocks are modified in
    place and the same list is returned.
    """
    marker = '<|txt_contd|>'
    pos = 0
    while pos < len(blocks):
        current = blocks[pos]
        tail_line = current["lines"][-1] if current["lines"] else None
        tail_span = None
        if tail_line:
            tail_span = tail_line["spans"][-1] if tail_line["spans"] else None

        if not (tail_span and tail_span['content'].endswith(marker)):
            pos += 1
            continue

        tail_span['content'] = tail_span['content'][:-len(marker)]

        # Skip blocks that were already emptied by an earlier merge.
        donor_pos = pos + 1
        while donor_pos < len(blocks) and blocks[donor_pos].get(SplitFlag.LINES_DELETED, False):
            donor_pos += 1

        if donor_pos >= len(blocks):
            # Nothing left to merge with.
            pos += 1
            continue

        donor = blocks[donor_pos]
        current["lines"].extend(donor["lines"])
        donor["lines"] = []
        donor[SplitFlag.LINES_DELETED] = True
        # pos is left unchanged so the merged block is examined again.
    return blocks


# Import-time LaTeX delimiter configuration: use the configured delimiters
# when the config reader returns something truthy, else the `$` / `$$` defaults.
latex_delimiters_config = get_latex_delimiter_config()

default_delimiters = {
    'display': {'left': '$$', 'right': '$$'},
    'inline': {'left': '$', 'right': '$'}
}

delimiters = latex_delimiters_config or default_delimiters

_display_delimiters = delimiters['display']
_inline_delimiters = delimiters['inline']
display_left_delimiter = _display_delimiters['left']
display_right_delimiter = _display_delimiters['right']
inline_left_delimiter = _inline_delimiters['left']
inline_right_delimiter = _inline_delimiters['right']
# Markdown hard line break: two trailing spaces before the newline. A single
# trailing space is only a soft break and would merge captions into one line.
_MD_HARD_BREAK = '  \n'


def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
    """Concatenate the spans of a block into one Markdown string.

    Text spans are emitted verbatim and inline equations are wrapped in the
    configured inline delimiters; each is followed by a single space unless it
    is the last span of its line. Interline equations are wrapped in the
    display delimiters when ``formula_enable`` is true. Otherwise they fall
    back to an image link when the span has an ``image_path``.
    """
    para_text = ''
    for line in para_block['lines']:
        last_pos = len(line['spans']) - 1
        for j, span in enumerate(line['spans']):
            span_type = span['type']
            content = ''
            if span_type == ContentType.TEXT:
                content = span['content']
            elif span_type == ContentType.INLINE_EQUATION:
                content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
            elif span_type == ContentType.INTERLINE_EQUATION:
                if formula_enable:
                    content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
                elif span.get('image_path', ''):
                    content = f"![]({img_buket_path}/{span['image_path']})"

            if not content:
                continue
            if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
                para_text += content if j == last_pos else f'{content} '
            elif span_type == ContentType.INTERLINE_EQUATION:
                para_text += content
    return para_text


def _image_body_markdown(para_block, img_buket_path):
    """Return image links for all image spans of the block's image bodies."""
    text = ''
    for block in para_block['blocks']:
        if block['type'] != BlockType.IMAGE_BODY:
            continue
        for line in block['lines']:
            for span in line['spans']:
                if span['type'] == ContentType.IMAGE and span.get('image_path', ''):
                    text += f"![]({img_buket_path}/{span['image_path']})"
    return text


def _image_block_markdown(para_block, img_buket_path):
    """Render an image block.

    With footnotes the order is caption, image, footnote; without footnotes
    the captions follow the image.
    """
    sub_blocks = para_block['blocks']
    captions = [merge_para_with_text(b) for b in sub_blocks if b['type'] == BlockType.IMAGE_CAPTION]
    footnotes = [merge_para_with_text(b) for b in sub_blocks if b['type'] == BlockType.IMAGE_FOOTNOTE]
    body = _image_body_markdown(para_block, img_buket_path)

    # Presence of a footnote block decides the layout, even if its text is empty.
    if any(b['type'] == BlockType.IMAGE_FOOTNOTE for b in sub_blocks):
        return (
            ''.join(caption + _MD_HARD_BREAK for caption in captions)
            + body
            + ''.join(_MD_HARD_BREAK + footnote for footnote in footnotes)
        )
    return body + ''.join(_MD_HARD_BREAK + caption for caption in captions)


def _table_block_markdown(para_block, table_enable, img_buket_path):
    """Render a table block as captions, body (HTML or image), then footnotes."""
    text = ''
    for block in para_block['blocks']:
        if block['type'] == BlockType.TABLE_CAPTION:
            text += merge_para_with_text(block) + _MD_HARD_BREAK
    for block in para_block['blocks']:
        if block['type'] != BlockType.TABLE_BODY:
            continue
        for line in block['lines']:
            for span in line['spans']:
                if span['type'] != ContentType.TABLE:
                    continue
                # HTML is used only when table parsing is enabled; the image
                # is the fallback in every other case.
                if table_enable and span.get('html', ''):
                    text += f"\n{span['html']}\n"
                elif span.get('image_path', ''):
                    text += f"![]({img_buket_path}/{span['image_path']})"
    for block in para_block['blocks']:
        if block['type'] == BlockType.TABLE_FOOTNOTE:
            text += '\n' + merge_para_with_text(block) + '  '
    return text


def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable, img_buket_path=''):
    """Render the paragraph blocks of one page as a list of Markdown strings.

    Images and tables are skipped in ``MakeMode.NLP_MD`` and rendered in
    ``MakeMode.MM_MD``. Blocks whose rendered text is blank are omitted; every
    returned string is stripped.
    """
    page_markdown = []
    for para_block in para_blocks:
        para_text = ''
        para_type = para_block['type']
        if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX, BlockType.INTERLINE_EQUATION]:
            para_text = merge_para_with_text(para_block, formula_enable=formula_enable, img_buket_path=img_buket_path)
        elif para_type == BlockType.TITLE:
            title_level = get_title_level(para_block)
            para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
        elif para_type == BlockType.IMAGE:
            if make_mode == MakeMode.NLP_MD:
                continue
            elif make_mode == MakeMode.MM_MD:
                para_text = _image_block_markdown(para_block, img_buket_path)
        elif para_type == BlockType.TABLE:
            if make_mode == MakeMode.NLP_MD:
                continue
            elif make_mode == MakeMode.MM_MD:
                para_text = _table_block_markdown(para_block, table_enable, img_buket_path)

        stripped = para_text.strip()
        if stripped:
            page_markdown.append(stripped)

    return page_markdown


def make_blocks_to_content_list(para_block, img_buket_path, page_idx):
    """Convert one paragraph block into a content-list entry (a dict).

    Titles get ``text_level`` when their level is non-zero. Image and table
    entries collect the image path, captions and footnotes; tables also get
    their HTML body when present. ``page_idx`` is always attached. Block types
    not handled here produce a dict that only contains ``page_idx``.
    """
    para_type = para_block['type']
    para_content = {}
    if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
        para_content = {
            'type': ContentType.TEXT,
            'text': merge_para_with_text(para_block),
        }
    elif para_type == BlockType.TITLE:
        title_level = get_title_level(para_block)
        para_content = {
            'type': ContentType.TEXT,
            'text': merge_para_with_text(para_block),
        }
        if title_level != 0:
            para_content['text_level'] = title_level
    elif para_type == BlockType.INTERLINE_EQUATION:
        para_content = {
            'type': ContentType.EQUATION,
            'text': merge_para_with_text(para_block),
            'text_format': 'latex',
        }
    elif para_type == BlockType.IMAGE:
        para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.IMAGE_BODY:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.IMAGE:
                            if span.get('image_path', ''):
                                para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
            if block['type'] == BlockType.IMAGE_CAPTION:
                para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
            if block['type'] == BlockType.IMAGE_FOOTNOTE:
                para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
    elif para_type == BlockType.TABLE:
        para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
        for block in para_block['blocks']:
            if block['type'] == BlockType.TABLE_BODY:
                for line in block['lines']:
                    for span in line['spans']:
                        if span['type'] == ContentType.TABLE:

                            if span.get('html', ''):
                                para_content[BlockType.TABLE_BODY] = f"{span['html']}"

                            if span.get('image_path', ''):
                                para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"

            if block['type'] == BlockType.TABLE_CAPTION:
                para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
            if block['type'] == BlockType.TABLE_FOOTNOTE:
                para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))

    para_content['page_idx'] = page_idx

    return para_content


def union_make(pdf_info_dict: list,
               make_mode: str,
               img_buket_path: str = '',
               ):
    """Build the final output for a whole document.

    Returns a Markdown string (paragraphs joined by blank lines) for
    ``MakeMode.MM_MD`` / ``MakeMode.NLP_MD``, a list of content entries for
    ``MakeMode.CONTENT_LIST`` and ``None`` for any other mode. Pages without
    ``para_blocks`` are skipped. The ``MINERU_VLM_FORMULA_ENABLE`` and
    ``MINERU_VLM_TABLE_ENABLE`` environment variables (default ``'True'``)
    are passed through ``get_formula_enable`` / ``get_table_enable``.
    """
    formula_enable = get_formula_enable(os.getenv('MINERU_VLM_FORMULA_ENABLE', 'True').lower() == 'true')
    table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')

    markdown_modes = [MakeMode.MM_MD, MakeMode.NLP_MD]
    output_content = []
    for page_info in pdf_info_dict:
        paras_of_layout = page_info.get('para_blocks')
        page_idx = page_info.get('page_idx')
        if not paras_of_layout:
            continue
        if make_mode in markdown_modes:
            page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
            output_content.extend(page_markdown)
        elif make_mode == MakeMode.CONTENT_LIST:
            for para_block in paras_of_layout:
                para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx)
                output_content.append(para_content)

    if make_mode in markdown_modes:
        return '\n\n'.join(output_content)
    elif make_mode == MakeMode.CONTENT_LIST:
        return output_content
    return None


def get_title_level(block):
    """Return the heading level of a title block, clamped to 0..4.

    A missing ``level`` defaults to 1, levels above 4 become 4 and levels
    below 1 become 0 (meaning "not a heading").
    """
    title_level = block.get('level', 1)
    if title_level > 4:
        title_level = 4
    elif title_level < 1:
        title_level = 0
    return title_level
@click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True))
@click.pass_context
@click.version_option(__version__,
                      '--version',
                      '-v',
                      help='display the version and exit')
@click.option(
    '-p',
    '--path',
    'input_path',
    type=click.Path(exists=True),
    required=True,
    help='local filepath or directory. support pdf, png, jpg, jpeg files',
)
@click.option(
    '-o',
    '--output',
    'output_dir',
    type=click.Path(),
    required=True,
    help='output local directory',
)
@click.option(
    '-m',
    '--method',
    'method',
    type=click.Choice(['auto', 'txt', 'ocr']),
    help="""the method for parsing pdf:
    auto: Automatically determine the method based on the file type.
    txt: Use text extraction method.
    ocr: Use OCR method for image-based PDFs.
    Without method specified, 'auto' will be used by default.
    Adapted only for the case where the backend is set to "pipeline".""",
    default='auto',
)
@click.option(
    '-b',
    '--backend',
    'backend',
    type=click.Choice(['pipeline', 'vlm-transformers', 'vlm-sglang-engine', 'vlm-sglang-client']),
    help="""the backend for parsing pdf:
    pipeline: More general.
    vlm-transformers: More general.
    vlm-sglang-engine: Faster(engine).
    vlm-sglang-client: Faster(client).
    Without backend specified, pipeline will be used by default.""",
    default='pipeline',
)
@click.option(
    '-l',
    '--lang',
    'lang',
    type=click.Choice(['ch', 'ch_server', 'ch_lite', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka',
                       'latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari']),
    help="""
    Input the languages in the pdf (if known) to improve OCR accuracy. Optional.
    Without languages specified, 'ch' will be used by default.
    Adapted only for the case where the backend is set to "pipeline".
    """,
    default='ch',
)
@click.option(
    '-u',
    '--url',
    'server_url',
    type=str,
    help="""
    When the backend is `sglang-client`, you need to specify the server_url, for example:`http://127.0.0.1:30000`
    """,
    default=None,
)
@click.option(
    '-s',
    '--start',
    'start_page_id',
    type=int,
    help='The starting page for PDF parsing, beginning from 0.',
    default=0,
)
@click.option(
    '-e',
    '--end',
    'end_page_id',
    type=int,
    help='The ending page for PDF parsing, beginning from 0.',
    default=None,
)
@click.option(
    '-f',
    '--formula',
    'formula_enable',
    type=bool,
    help='Enable formula parsing. Default is True. Adapted only for the case where the backend is set to "pipeline".',
    default=True,
)
@click.option(
    '-t',
    '--table',
    'table_enable',
    type=bool,
    help='Enable table parsing. Default is True. Adapted only for the case where the backend is set to "pipeline".',
    default=True,
)
@click.option(
    '-d',
    '--device',
    'device_mode',
    type=str,
    help='Device mode for model inference, e.g., "cpu", "cuda", "cuda:0", "npu", "npu:0", "mps". Adapted only for the case where the backend is set to "pipeline". ',
    default=None,
)
@click.option(
    '--vram',
    'virtual_vram',
    type=int,
    help='Upper limit of GPU memory occupied by a single process. Adapted only for the case where the backend is set to "pipeline". ',
    default=None,
)
@click.option(
    '--source',
    'model_source',
    type=click.Choice(['huggingface', 'modelscope', 'local']),
    help="""
    The source of the model repository. Default is 'huggingface'.
    """,
    default='huggingface',
)
def main(
        ctx,
        input_path, output_dir, method, backend, lang, server_url,
        start_page_id, end_page_id, formula_enable, table_enable,
        device_mode, virtual_vram, model_source, **kwargs
):
    """Command-line entry point: parse one file or every supported file in a directory.

    Unknown command-line options are collected through ``arg_parse(ctx)`` and
    forwarded to ``do_parse``. For non-client backends the environment
    variables ``MINERU_DEVICE_MODE``, ``MINERU_VIRTUAL_VRAM_SIZE`` and
    ``MINERU_MODEL_SOURCE`` are filled in, but only when they are not already
    set. Any exception raised while parsing is logged, not propagated.
    """
    kwargs.update(arg_parse(ctx))

    if not backend.endswith('-client'):
        def get_device_mode() -> str:
            # An explicit --device wins over auto-detection.
            if device_mode is not None:
                return device_mode
            return get_device()

        if os.getenv('MINERU_DEVICE_MODE', None) is None:
            os.environ['MINERU_DEVICE_MODE'] = get_device_mode()

        def get_virtual_vram_size() -> int:
            if virtual_vram is not None:
                return virtual_vram
            resolved_mode = get_device_mode()
            if resolved_mode.startswith(("cuda", "npu")):
                return round(get_vram(resolved_mode))
            return 1

        if os.getenv('MINERU_VIRTUAL_VRAM_SIZE', None) is None:
            os.environ['MINERU_VIRTUAL_VRAM_SIZE'] = str(get_virtual_vram_size())

        # NOTE(review): the original indentation was lost; this is placed
        # inside the non-client branch — confirm against upstream.
        if os.getenv('MINERU_MODEL_SOURCE', None) is None:
            os.environ['MINERU_MODEL_SOURCE'] = model_source

    os.makedirs(output_dir, exist_ok=True)

    def parse_doc(path_list: list[Path]):
        # Best-effort batch: a failure is logged and the command still exits normally.
        try:
            file_name_list = []
            pdf_bytes_list = []
            lang_list = []
            for path in path_list:
                file_name_list.append(str(Path(path).stem))
                pdf_bytes_list.append(read_fn(path))
                lang_list.append(lang)
            do_parse(
                output_dir=output_dir,
                pdf_file_names=file_name_list,
                pdf_bytes_list=pdf_bytes_list,
                p_lang_list=lang_list,
                backend=backend,
                parse_method=method,
                formula_enable=formula_enable,
                table_enable=table_enable,
                server_url=server_url,
                start_page_id=start_page_id,
                end_page_id=end_page_id,
                **kwargs,
            )
        except Exception as e:
            logger.exception(e)

    if os.path.isdir(input_path):
        # Compare suffixes case-insensitively so "SCAN.PDF" is not skipped.
        supported_suffixes = set(pdf_suffixes + image_suffixes)
        doc_path_list = [
            doc_path
            for doc_path in Path(input_path).glob('*')
            if doc_path.suffix.lower() in supported_suffixes
        ]
        parse_doc(doc_path_list)
    else:
        parse_doc([Path(input_path)])


if __name__ == '__main__':
    main()


pdf_suffixes = [".pdf"]
image_suffixes = [".png", ".jpeg", ".jpg", ".webp", ".gif"]


def read_fn(path):
    """Return the PDF bytes for *path*.

    PDF files are returned as read; image files are converted with
    ``images_bytes_to_pdf_bytes``. The suffix is matched case-insensitively
    and checked before the file is read.

    Raises:
        ValueError: if the suffix is neither a PDF nor a supported image suffix.
    """
    if not isinstance(path, Path):
        path = Path(path)

    suffix = path.suffix.lower()
    if suffix not in image_suffixes and suffix not in pdf_suffixes:
        raise ValueError(f"Unknown file suffix: {path.suffix}")

    with open(str(path), "rb") as input_file:
        file_bytes = input_file.read()

    if suffix in image_suffixes:
        return images_bytes_to_pdf_bytes(file_bytes)
    return file_bytes


def prepare_env(output_dir, pdf_file_name, parse_method):
    """Create ``<output_dir>/<pdf_file_name>/<parse_method>/images``.

    Returns:
        ``(local_image_dir, local_md_dir)`` where the markdown directory is
        the parent of the image directory.
    """
    local_md_dir = str(os.path.join(output_dir, pdf_file_name, parse_method))
    local_image_dir = os.path.join(str(local_md_dir), "images")
    # Creating the image directory also creates its parent markdown directory.
    os.makedirs(local_image_dir, exist_ok=True)
    return local_image_dir, local_md_dir
def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
    """Return a new PDF (as bytes) containing pages ``start_page_id..end_page_id``.

    ``end_page_id`` is inclusive. ``None`` or a negative value means "last
    page", and a value past the end is clamped to the last page with a
    warning. Both pdfium documents are closed even when importing or saving
    fails.
    """
    pdf = pdfium.PdfDocument(pdf_bytes)
    output_pdf = None
    try:
        last_page_id = len(pdf) - 1
        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        if end_page_id > last_page_id:
            logger.warning("end_page_id is out of range, use pdf_docs length")
            end_page_id = last_page_id

        output_pdf = pdfium.PdfDocument.new()

        # Copy the selected pages from the source into the new document.
        page_indices = list(range(start_page_id, end_page_id + 1))
        output_pdf.import_pages(pdf, page_indices)

        # Serialise the new document into memory.
        output_buffer = io.BytesIO()
        output_pdf.save(output_buffer)
        return output_buffer.getvalue()
    finally:
        pdf.close()
        if output_pdf is not None:
            output_pdf.close()


def _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id):
    """Apply the page-range cut to every PDF in *pdf_bytes_list*."""
    return [
        convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id, end_page_id)
        for pdf_bytes in pdf_bytes_list
    ]


def _process_output(
        pdf_info,
        pdf_bytes,
        pdf_file_name,
        local_md_dir,
        local_image_dir,
        md_writer,
        f_draw_layout_bbox,
        f_draw_span_bbox,
        f_dump_orig_pdf,
        f_dump_md,
        f_dump_content_list,
        f_dump_middle_json,
        f_dump_model_output,
        f_make_md_mode,
        middle_json,
        model_output=None,
        is_pipeline=True
):
    """Write the requested output artefacts for one document.

    Each ``f_*`` flag enables one artefact: layout/span debug PDFs, the
    original PDF, Markdown, the content list, the middle JSON and the raw
    model output. Model output is JSON for the pipeline backend and a text
    file with a separator between entries otherwise.
    """
    if is_pipeline:
        # Imported lazily, and only for the pipeline backend.
        from mineru.backend.pipeline.pipeline_middle_json_mkcontent import union_make as make_func
    else:
        make_func = vlm_union_make

    if f_draw_layout_bbox:
        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_layout.pdf")

    if f_draw_span_bbox:
        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir, f"{pdf_file_name}_span.pdf")

    if f_dump_orig_pdf:
        md_writer.write(
            f"{pdf_file_name}_origin.pdf",
            pdf_bytes,
        )

    # Image links in the generated documents are relative to the markdown directory.
    image_dir = str(os.path.basename(local_image_dir))

    if f_dump_md:
        md_content_str = make_func(pdf_info, f_make_md_mode, image_dir)
        md_writer.write_string(
            f"{pdf_file_name}.md",
            md_content_str,
        )

    if f_dump_content_list:
        content_list = make_func(pdf_info, MakeMode.CONTENT_LIST, image_dir)
        md_writer.write_string(
            f"{pdf_file_name}_content_list.json",
            json.dumps(content_list, ensure_ascii=False, indent=4),
        )

    if f_dump_middle_json:
        md_writer.write_string(
            f"{pdf_file_name}_middle.json",
            json.dumps(middle_json, ensure_ascii=False, indent=4),
        )

    if f_dump_model_output:
        if is_pipeline:
            md_writer.write_string(
                f"{pdf_file_name}_model.json",
                json.dumps(model_output, ensure_ascii=False, indent=4),
            )
        else:
            output_text = ("\n" + "-" * 50 + "\n").join(model_output)
            md_writer.write_string(
                f"{pdf_file_name}_model_output.txt",
                output_text,
            )

    logger.info(f"local output dir is {local_md_dir}")


def _process_pipeline(
        output_dir,
        pdf_file_names,
        pdf_bytes_list,
        p_lang_list,
        parse_method,
        p_formula_enable,
        p_table_enable,
        f_draw_layout_bbox,
        f_draw_span_bbox,
        f_dump_md,
        f_dump_middle_json,
        f_dump_model_output,
        f_dump_orig_pdf,
        f_dump_content_list,
        f_make_md_mode,
):
    """Run the pipeline backend over all documents and write their outputs.

    Analysis runs once for the whole batch; each document is then converted
    to middle JSON and handed to ``_process_output``.
    """
    from mineru.backend.pipeline.model_json_to_middle_json import result_to_middle_json as pipeline_result_to_middle_json
    from mineru.backend.pipeline.pipeline_analyze import doc_analyze as pipeline_doc_analyze

    infer_results, all_image_lists, all_pdf_docs, lang_list, ocr_enabled_list = (
        pipeline_doc_analyze(
            pdf_bytes_list, p_lang_list, parse_method=parse_method,
            formula_enable=p_formula_enable, table_enable=p_table_enable
        )
    )

    for idx, model_list in enumerate(infer_results):
        # Copy taken before the middle-JSON conversion is handed model_list.
        model_json = copy.deepcopy(model_list)
        pdf_file_name = pdf_file_names[idx]
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

        images_list = all_image_lists[idx]
        pdf_doc = all_pdf_docs[idx]
        _lang = lang_list[idx]
        _ocr_enable = ocr_enabled_list[idx]

        middle_json = pipeline_result_to_middle_json(
            model_list, images_list, pdf_doc, image_writer,
            _lang, _ocr_enable, p_formula_enable
        )

        pdf_info = middle_json["pdf_info"]
        pdf_bytes = pdf_bytes_list[idx]

        _process_output(
            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
            f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
            f_make_md_mode, middle_json, model_json, is_pipeline=True
        )


async def _async_process_vlm(
        output_dir,
        pdf_file_names,
        pdf_bytes_list,
        backend,
        f_draw_layout_bbox,
        f_draw_span_bbox,
        f_dump_md,
        f_dump_middle_json,
        f_dump_model_output,
        f_dump_orig_pdf,
        f_dump_content_list,
        f_make_md_mode,
        server_url=None,
        **kwargs,
):
    """Asynchronously run a VLM backend over all documents and write outputs.

    Output goes under the ``vlm`` sub-directory. ``f_draw_span_bbox`` is
    always forced off, and ``server_url`` is ignored unless the backend name
    ends with ``client``. Documents are processed one after another.
    """
    parse_method = "vlm"
    f_draw_span_bbox = False
    if not backend.endswith("client"):
        server_url = None

    for idx, pdf_bytes in enumerate(pdf_bytes_list):
        pdf_file_name = pdf_file_names[idx]
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

        middle_json, infer_result = await aio_vlm_doc_analyze(
            pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url, **kwargs,
        )

        pdf_info = middle_json["pdf_info"]

        _process_output(
            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
            f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
            f_make_md_mode, middle_json, infer_result, is_pipeline=False
        )


def _process_vlm(
        output_dir,
        pdf_file_names,
        pdf_bytes_list,
        backend,
        f_draw_layout_bbox,
        f_draw_span_bbox,
        f_dump_md,
        f_dump_middle_json,
        f_dump_model_output,
        f_dump_orig_pdf,
        f_dump_content_list,
        f_make_md_mode,
        server_url=None,
        **kwargs,
):
    """Synchronously run a VLM backend over all documents and write outputs.

    Same contract as ``_async_process_vlm``, but inference goes through the
    blocking ``vlm_doc_analyze``.
    """
    parse_method = "vlm"
    f_draw_span_bbox = False
    if not backend.endswith("client"):
        server_url = None

    for idx, pdf_bytes in enumerate(pdf_bytes_list):
        pdf_file_name = pdf_file_names[idx]
        local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
        image_writer, md_writer = FileBasedDataWriter(local_image_dir), FileBasedDataWriter(local_md_dir)

        middle_json, infer_result = vlm_doc_analyze(
            pdf_bytes, image_writer=image_writer, backend=backend, server_url=server_url, **kwargs,
        )

        pdf_info = middle_json["pdf_info"]

        _process_output(
            pdf_info, pdf_bytes, pdf_file_name, local_md_dir, local_image_dir,
            md_writer, f_draw_layout_bbox, f_draw_span_bbox, f_dump_orig_pdf,
            f_dump_md, f_dump_content_list, f_dump_middle_json, f_dump_model_output,
            f_make_md_mode, middle_json, infer_result, is_pipeline=False
        )
backend.startswith("vlm-"): + backend = backend[4:] + + os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable) + os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable) + + _process_vlm( + output_dir, pdf_file_names, pdf_bytes_list, backend, + f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json, + f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode, + server_url, **kwargs, + ) + + +async def aio_do_parse( + output_dir, + pdf_file_names: list[str], + pdf_bytes_list: list[bytes], + p_lang_list: list[str], + backend="pipeline", + parse_method="auto", + formula_enable=True, + table_enable=True, + server_url=None, + f_draw_layout_bbox=True, + f_draw_span_bbox=True, + f_dump_md=True, + f_dump_middle_json=True, + f_dump_model_output=True, + f_dump_orig_pdf=True, + f_dump_content_list=True, + f_make_md_mode=MakeMode.MM_MD, + start_page_id=0, + end_page_id=None, + **kwargs, +): + # 预处理PDF字节数据 + pdf_bytes_list = _prepare_pdf_bytes(pdf_bytes_list, start_page_id, end_page_id) + + if backend == "pipeline": + # pipeline模式暂不支持异步,使用同步处理方式 + _process_pipeline( + output_dir, pdf_file_names, pdf_bytes_list, p_lang_list, + parse_method, formula_enable, table_enable, + f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json, + f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode + ) + else: + if backend.startswith("vlm-"): + backend = backend[4:] + + os.environ['MINERU_VLM_FORMULA_ENABLE'] = str(formula_enable) + os.environ['MINERU_VLM_TABLE_ENABLE'] = str(table_enable) + + await _async_process_vlm( + output_dir, pdf_file_names, pdf_bytes_list, backend, + f_draw_layout_bbox, f_draw_span_bbox, f_dump_md, f_dump_middle_json, + f_dump_model_output, f_dump_orig_pdf, f_dump_content_list, f_make_md_mode, + server_url, **kwargs, + ) + + + +if __name__ == "__main__": + # pdf_path = "../../demo/pdfs/demo3.pdf" + pdf_path = "C:/Users/zhaoxiaomeng/Downloads/4546d0e2-ba60-40a5-a17e-b68555cec741.pdf" + + try: + 
do_parse("./output", [Path(pdf_path).stem], [read_fn(Path(pdf_path))],["ch"], + end_page_id=10, + backend='vlm-huggingface' + # backend = 'pipeline' + ) + except Exception as e: + logger.exception(e) diff --git a/vendor/mineru/mineru/cli/fast_api.py b/vendor/mineru/mineru/cli/fast_api.py new file mode 100644 index 0000000000000000000000000000000000000000..1bfc3e5d1beafc4e8955b1247379b8feed44d277 --- /dev/null +++ b/vendor/mineru/mineru/cli/fast_api.py @@ -0,0 +1,198 @@ +import uuid +import os +import uvicorn +import click +from pathlib import Path +from glob import glob +from fastapi import FastAPI, UploadFile, File, Form +from fastapi.middleware.gzip import GZipMiddleware +from fastapi.responses import JSONResponse +from typing import List, Optional +from loguru import logger +from base64 import b64encode + +from mineru.cli.common import aio_do_parse, read_fn, pdf_suffixes, image_suffixes +from mineru.utils.cli_parser import arg_parse +from mineru.version import __version__ + +app = FastAPI() +app.add_middleware(GZipMiddleware, minimum_size=1000) + +def encode_image(image_path: str) -> str: + """Encode image using base64""" + with open(image_path, "rb") as f: + return b64encode(f.read()).decode() + + +def get_infer_result(file_suffix_identifier: str, pdf_name: str, parse_dir: str) -> Optional[str]: + """从结果文件中读取推理结果""" + result_file_path = os.path.join(parse_dir, f"{pdf_name}{file_suffix_identifier}") + if os.path.exists(result_file_path): + with open(result_file_path, "r", encoding="utf-8") as fp: + return fp.read() + return None + + +@app.post(path="/file_parse",) +async def parse_pdf( + files: List[UploadFile] = File(...), + output_dir: str = Form("./output"), + lang_list: List[str] = Form(["ch"]), + backend: str = Form("pipeline"), + parse_method: str = Form("auto"), + formula_enable: bool = Form(True), + table_enable: bool = Form(True), + server_url: Optional[str] = Form(None), + return_md: bool = Form(True), + return_middle_json: bool = Form(False), + 
return_model_output: bool = Form(False), + return_content_list: bool = Form(False), + return_images: bool = Form(False), + start_page_id: int = Form(0), + end_page_id: int = Form(99999), +): + + # 获取命令行配置参数 + config = getattr(app.state, "config", {}) + + try: + # 创建唯一的输出目录 + unique_dir = os.path.join(output_dir, str(uuid.uuid4())) + os.makedirs(unique_dir, exist_ok=True) + + # 处理上传的PDF文件 + pdf_file_names = [] + pdf_bytes_list = [] + + for file in files: + content = await file.read() + file_path = Path(file.filename) + + # 如果是图像文件或PDF,使用read_fn处理 + if file_path.suffix.lower() in pdf_suffixes + image_suffixes: + # 创建临时文件以便使用read_fn + temp_path = Path(unique_dir) / file_path.name + with open(temp_path, "wb") as f: + f.write(content) + + try: + pdf_bytes = read_fn(temp_path) + pdf_bytes_list.append(pdf_bytes) + pdf_file_names.append(file_path.stem) + os.remove(temp_path) # 删除临时文件 + except Exception as e: + return JSONResponse( + status_code=400, + content={"error": f"Failed to load file: {str(e)}"} + ) + else: + return JSONResponse( + status_code=400, + content={"error": f"Unsupported file type: {file_path.suffix}"} + ) + + + # 设置语言列表,确保与文件数量一致 + actual_lang_list = lang_list + if len(actual_lang_list) != len(pdf_file_names): + # 如果语言列表长度不匹配,使用第一个语言或默认"ch" + actual_lang_list = [actual_lang_list[0] if actual_lang_list else "ch"] * len(pdf_file_names) + + # 调用异步处理函数 + await aio_do_parse( + output_dir=unique_dir, + pdf_file_names=pdf_file_names, + pdf_bytes_list=pdf_bytes_list, + p_lang_list=actual_lang_list, + backend=backend, + parse_method=parse_method, + formula_enable=formula_enable, + table_enable=table_enable, + server_url=server_url, + f_draw_layout_bbox=False, + f_draw_span_bbox=False, + f_dump_md=return_md, + f_dump_middle_json=return_middle_json, + f_dump_model_output=return_model_output, + f_dump_orig_pdf=False, + f_dump_content_list=return_content_list, + start_page_id=start_page_id, + end_page_id=end_page_id, + **config + ) + + # 构建结果路径 + result_dict = {} + 
for pdf_name in pdf_file_names: + result_dict[pdf_name] = {} + data = result_dict[pdf_name] + + if backend.startswith("pipeline"): + parse_dir = os.path.join(unique_dir, pdf_name, parse_method) + else: + parse_dir = os.path.join(unique_dir, pdf_name, "vlm") + + if os.path.exists(parse_dir): + if return_md: + data["md_content"] = get_infer_result(".md", pdf_name, parse_dir) + if return_middle_json: + data["middle_json"] = get_infer_result("_middle.json", pdf_name, parse_dir) + if return_model_output: + if backend.startswith("pipeline"): + data["model_output"] = get_infer_result("_model.json", pdf_name, parse_dir) + else: + data["model_output"] = get_infer_result("_model_output.txt", pdf_name, parse_dir) + if return_content_list: + data["content_list"] = get_infer_result("_content_list.json", pdf_name, parse_dir) + if return_images: + image_paths = glob(f"{parse_dir}/images/*.jpg") + data["images"] = { + os.path.basename( + image_path + ): f"data:image/jpeg;base64,{encode_image(image_path)}" + for image_path in image_paths + } + return JSONResponse( + status_code=200, + content={ + "backend": backend, + "version": __version__, + "results": result_dict + } + ) + except Exception as e: + logger.exception(e) + return JSONResponse( + status_code=500, + content={"error": f"Failed to process file: {str(e)}"} + ) + + +@click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) +@click.pass_context +@click.option('--host', default='127.0.0.1', help='Server host (default: 127.0.0.1)') +@click.option('--port', default=8000, type=int, help='Server port (default: 8000)') +@click.option('--reload', is_flag=True, help='Enable auto-reload (development mode)') +def main(ctx, host, port, reload, **kwargs): + + kwargs.update(arg_parse(ctx)) + + # 将配置参数存储到应用状态中 + app.state.config = kwargs + + """启动MinerU FastAPI服务器的命令行入口""" + print(f"Start MinerU FastAPI Service: http://{host}:{port}") + print("The API documentation can be accessed at the following 
address:") + print(f"- Swagger UI: http://{host}:{port}/docs") + print(f"- ReDoc: http://{host}:{port}/redoc") + + uvicorn.run( + "mineru.cli.fast_api:app", + host=host, + port=port, + reload=reload + ) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/vendor/mineru/mineru/cli/gradio_app.py b/vendor/mineru/mineru/cli/gradio_app.py new file mode 100644 index 0000000000000000000000000000000000000000..7d33dce581dd27941d86a3357495d452623f47cf --- /dev/null +++ b/vendor/mineru/mineru/cli/gradio_app.py @@ -0,0 +1,343 @@ +# Copyright (c) Opendatalab. All rights reserved. + +import base64 +import os +import re +import time +import zipfile +from pathlib import Path + +import click +import gradio as gr +from gradio_pdf import PDF +from loguru import logger + +from mineru.cli.common import prepare_env, read_fn, aio_do_parse, pdf_suffixes, image_suffixes +from mineru.utils.cli_parser import arg_parse +from mineru.utils.hash_utils import str_sha256 + + +async def parse_pdf(doc_path, output_dir, end_page_id, is_ocr, formula_enable, table_enable, language, backend, url): + os.makedirs(output_dir, exist_ok=True) + + try: + file_name = f'{safe_stem(Path(doc_path).stem)}_{time.strftime("%y%m%d_%H%M%S")}' + pdf_data = read_fn(doc_path) + if is_ocr: + parse_method = 'ocr' + else: + parse_method = 'auto' + + if backend.startswith("vlm"): + parse_method = "vlm" + + local_image_dir, local_md_dir = prepare_env(output_dir, file_name, parse_method) + await aio_do_parse( + output_dir=output_dir, + pdf_file_names=[file_name], + pdf_bytes_list=[pdf_data], + p_lang_list=[language], + parse_method=parse_method, + end_page_id=end_page_id, + formula_enable=formula_enable, + table_enable=table_enable, + backend=backend, + server_url=url, + ) + return local_md_dir, file_name + except Exception as e: + logger.exception(e) + return None + + +def compress_directory_to_zip(directory_path, output_zip_path): + """压缩指定目录到一个 ZIP 文件。 + + :param directory_path: 要压缩的目录路径 + 
:param output_zip_path: 输出的 ZIP 文件路径 + """ + try: + with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf: + + # 遍历目录中的所有文件和子目录 + for root, dirs, files in os.walk(directory_path): + for file in files: + # 构建完整的文件路径 + file_path = os.path.join(root, file) + # 计算相对路径 + arcname = os.path.relpath(file_path, directory_path) + # 添加文件到 ZIP 文件 + zipf.write(file_path, arcname) + return 0 + except Exception as e: + logger.exception(e) + return -1 + + +def image_to_base64(image_path): + with open(image_path, 'rb') as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + +def replace_image_with_base64(markdown_text, image_dir_path): + # 匹配Markdown中的图片标签 + pattern = r'\!\[(?:[^\]]*)\]\(([^)]+)\)' + + # 替换图片链接 + def replace(match): + relative_path = match.group(1) + full_path = os.path.join(image_dir_path, relative_path) + base64_image = image_to_base64(full_path) + return f'![{relative_path}](data:image/jpeg;base64,{base64_image})' + + # 应用替换 + return re.sub(pattern, replace, markdown_text) + + +async def to_markdown(file_path, end_pages=10, is_ocr=False, formula_enable=True, table_enable=True, language="ch", backend="pipeline", url=None): + file_path = to_pdf(file_path) + # 获取识别的md文件以及压缩包文件路径 + local_md_dir, file_name = await parse_pdf(file_path, './output', end_pages - 1, is_ocr, formula_enable, table_enable, language, backend, url) + archive_zip_path = os.path.join('./output', str_sha256(local_md_dir) + '.zip') + zip_archive_success = compress_directory_to_zip(local_md_dir, archive_zip_path) + if zip_archive_success == 0: + logger.info('Compression successful') + else: + logger.error('Compression failed') + md_path = os.path.join(local_md_dir, file_name + '.md') + with open(md_path, 'r', encoding='utf-8') as f: + txt_content = f.read() + md_content = replace_image_with_base64(txt_content, local_md_dir) + # 返回转换后的PDF路径 + new_pdf_path = os.path.join(local_md_dir, file_name + '_layout.pdf') + + return md_content, txt_content, 
archive_zip_path, new_pdf_path + + +latex_delimiters = [ + {'left': '$$', 'right': '$$', 'display': True}, + {'left': '$', 'right': '$', 'display': False}, + {'left': '\\(', 'right': '\\)', 'display': False}, + {'left': '\\[', 'right': '\\]', 'display': True}, +] + +header_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'resources', 'header.html') +with open(header_path, 'r') as header_file: + header = header_file.read() + + +latin_lang = [ + 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', # noqa: E126 + 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl', + 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv', + 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german' +] +arabic_lang = ['ar', 'fa', 'ug', 'ur'] +cyrillic_lang = [ + 'rs_cyrillic', 'bg', 'mn', 'abq', 'ady', 'kbd', 'ava', # noqa: E126 + 'dar', 'inh', 'che', 'lbe', 'lez', 'tab' +] +east_slavic_lang = ["ru", "be", "uk"] +devanagari_lang = [ + 'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', # noqa: E126 + 'sa', 'bgc' +] +other_lang = ['ch', 'ch_lite', 'ch_server', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka'] +add_lang = ['latin', 'arabic', 'east_slavic', 'cyrillic', 'devanagari'] + +# all_lang = ['', 'auto'] +all_lang = [] +# all_lang.extend([*other_lang, *latin_lang, *arabic_lang, *cyrillic_lang, *devanagari_lang]) +all_lang.extend([*other_lang, *add_lang]) + + +def safe_stem(file_path): + stem = Path(file_path).stem + # 只保留字母、数字、下划线和点,其他字符替换为下划线 + return re.sub(r'[^\w.]', '_', stem) + + +def to_pdf(file_path): + + if file_path is None: + return None + + pdf_bytes = read_fn(file_path) + + # unique_filename = f'{uuid.uuid4()}.pdf' + unique_filename = f'{safe_stem(file_path)}.pdf' + + # 构建完整的文件路径 + tmp_file_path = os.path.join(os.path.dirname(file_path), unique_filename) + + # 将字节数据写入文件 + with open(tmp_file_path, 'wb') as tmp_pdf_file: + tmp_pdf_file.write(pdf_bytes) + + return tmp_file_path + + +# 
更新界面函数 +def update_interface(backend_choice): + if backend_choice in ["vlm-transformers", "vlm-sglang-engine"]: + return gr.update(visible=False), gr.update(visible=False) + elif backend_choice in ["vlm-sglang-client"]: + return gr.update(visible=True), gr.update(visible=False) + elif backend_choice in ["pipeline"]: + return gr.update(visible=False), gr.update(visible=True) + else: + pass + + +@click.command(context_settings=dict(ignore_unknown_options=True, allow_extra_args=True)) +@click.pass_context +@click.option( + '--enable-example', + 'example_enable', + type=bool, + help="Enable example files for input." + "The example files to be input need to be placed in the `example` folder within the directory where the command is currently executed.", + default=True, +) +@click.option( + '--enable-sglang-engine', + 'sglang_engine_enable', + type=bool, + help="Enable SgLang engine backend for faster processing.", + default=False, +) +@click.option( + '--enable-api', + 'api_enable', + type=bool, + help="Enable gradio API for serving the application.", + default=True, +) +@click.option( + '--max-convert-pages', + 'max_convert_pages', + type=int, + help="Set the maximum number of pages to convert from PDF to Markdown.", + default=1000, +) +@click.option( + '--server-name', + 'server_name', + type=str, + help="Set the server name for the Gradio app.", + default=None, +) +@click.option( + '--server-port', + 'server_port', + type=int, + help="Set the server port for the Gradio app.", + default=None, +) +def main(ctx, + example_enable, sglang_engine_enable, api_enable, max_convert_pages, + server_name, server_port, **kwargs +): + + kwargs.update(arg_parse(ctx)) + + if sglang_engine_enable: + try: + print("Start init SgLang engine...") + from mineru.backend.vlm.vlm_analyze import ModelSingleton + model_singleton = ModelSingleton() + predictor = model_singleton.get_model( + "sglang-engine", + None, + None, + **kwargs + ) + print("SgLang engine init successfully.") + except 
Exception as e: + logger.exception(e) + + suffixes = pdf_suffixes + image_suffixes + with gr.Blocks() as demo: + gr.HTML(header) + with gr.Row(): + with gr.Column(variant='panel', scale=5): + with gr.Row(): + input_file = gr.File(label='Please upload a PDF or image', file_types=suffixes) + with gr.Row(): + max_pages = gr.Slider(1, max_convert_pages, int(max_convert_pages/2), step=1, label='Max convert pages') + with gr.Row(): + if sglang_engine_enable: + drop_list = ["pipeline", "vlm-sglang-engine"] + preferred_option = "vlm-sglang-engine" + else: + drop_list = ["pipeline", "vlm-transformers", "vlm-sglang-client"] + preferred_option = "pipeline" + backend = gr.Dropdown(drop_list, label="Backend", value=preferred_option) + with gr.Row(visible=False) as client_options: + url = gr.Textbox(label='Server URL', value='http://localhost:30000', placeholder='http://localhost:30000') + with gr.Row(equal_height=True): + with gr.Column(): + gr.Markdown("**Recognition Options:**") + formula_enable = gr.Checkbox(label='Enable formula recognition', value=True) + table_enable = gr.Checkbox(label='Enable table recognition', value=True) + with gr.Column(visible=False) as ocr_options: + language = gr.Dropdown(all_lang, label='Language', value='ch') + is_ocr = gr.Checkbox(label='Force enable OCR', value=False) + with gr.Row(): + change_bu = gr.Button('Convert') + clear_bu = gr.ClearButton(value='Clear') + pdf_show = PDF(label='PDF preview', interactive=False, visible=True, height=800) + if example_enable: + example_root = os.path.join(os.getcwd(), 'examples') + if os.path.exists(example_root): + with gr.Accordion('Examples:'): + gr.Examples( + examples=[os.path.join(example_root, _) for _ in os.listdir(example_root) if + _.endswith(tuple(suffixes))], + inputs=input_file + ) + + with gr.Column(variant='panel', scale=5): + output_file = gr.File(label='convert result', interactive=False) + with gr.Tabs(): + with gr.Tab('Markdown rendering'): + md = gr.Markdown(label='Markdown rendering', 
height=1100, show_copy_button=True, + latex_delimiters=latex_delimiters, + line_breaks=True) + with gr.Tab('Markdown text'): + md_text = gr.TextArea(lines=45, show_copy_button=True) + + # 添加事件处理 + backend.change( + fn=update_interface, + inputs=[backend], + outputs=[client_options, ocr_options], + api_name=False + ) + # 添加demo.load事件,在页面加载时触发一次界面更新 + demo.load( + fn=update_interface, + inputs=[backend], + outputs=[client_options, ocr_options], + api_name=False + ) + clear_bu.add([input_file, md, pdf_show, md_text, output_file, is_ocr]) + + if api_enable: + api_name = None + else: + api_name = False + + input_file.change(fn=to_pdf, inputs=input_file, outputs=pdf_show, api_name=api_name) + change_bu.click( + fn=to_markdown, + inputs=[input_file, max_pages, is_ocr, formula_enable, table_enable, language, backend, url], + outputs=[md, md_text, output_file, pdf_show], + api_name=api_name + ) + + demo.launch(server_name=server_name, server_port=server_port, show_api=api_enable) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/vendor/mineru/mineru/cli/models_download.py b/vendor/mineru/mineru/cli/models_download.py new file mode 100644 index 0000000000000000000000000000000000000000..8c35cd0acc91c91cc3a470f9fb8f1dc49e1b59e6 --- /dev/null +++ b/vendor/mineru/mineru/cli/models_download.py @@ -0,0 +1,150 @@ +import json +import os +import sys +import click +import requests +from loguru import logger + +from mineru.utils.enum_class import ModelPath +from mineru.utils.models_download_utils import auto_download_and_get_model_root_path + + +def download_json(url): + """下载JSON文件""" + response = requests.get(url) + response.raise_for_status() + return response.json() + + +def download_and_modify_json(url, local_filename, modifications): + """下载JSON并修改内容""" + if os.path.exists(local_filename): + data = json.load(open(local_filename)) + config_version = data.get('config_version', '0.0.0') + if config_version < '1.3.0': + data = download_json(url) + 
else: + data = download_json(url) + + # 修改内容 + for key, value in modifications.items(): + if key in data: + if isinstance(data[key], dict): + # 如果是字典,合并新值 + data[key].update(value) + else: + # 否则直接替换 + data[key] = value + + # 保存修改后的内容 + with open(local_filename, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) + + +def configure_model(model_dir, model_type): + """配置模型""" + json_url = 'https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/mineru.template.json' + config_file_name = os.getenv('MINERU_TOOLS_CONFIG_JSON', 'mineru.json') + home_dir = os.path.expanduser('~') + config_file = os.path.join(home_dir, config_file_name) + + json_mods = { + 'models-dir': { + f'{model_type}': model_dir + } + } + + download_and_modify_json(json_url, config_file, json_mods) + logger.info(f'The configuration file has been successfully configured, the path is: {config_file}') + + +def download_pipeline_models(): + """下载Pipeline模型""" + model_paths = [ + ModelPath.doclayout_yolo, + ModelPath.yolo_v8_mfd, + ModelPath.unimernet_small, + ModelPath.pytorch_paddle, + ModelPath.layout_reader, + ModelPath.slanet_plus + ] + download_finish_path = "" + for model_path in model_paths: + logger.info(f"Downloading model: {model_path}") + download_finish_path = auto_download_and_get_model_root_path(model_path, repo_mode='pipeline') + logger.info(f"Pipeline models downloaded successfully to: {download_finish_path}") + configure_model(download_finish_path, "pipeline") + + +def download_vlm_models(): + """下载VLM模型""" + download_finish_path = auto_download_and_get_model_root_path("/", repo_mode='vlm') + logger.info(f"VLM models downloaded successfully to: {download_finish_path}") + configure_model(download_finish_path, "vlm") + + +@click.command() +@click.option( + '-s', + '--source', + 'model_source', + type=click.Choice(['huggingface', 'modelscope']), + help=""" + The source of the model repository. 
+ """, + default=None, +) +@click.option( + '-m', + '--model_type', + 'model_type', + type=click.Choice(['pipeline', 'vlm', 'all']), + help=""" + The type of the model to download. + """, + default=None, +) +def download_models(model_source, model_type): + """Download MinerU model files. + + Supports downloading pipeline or VLM models from ModelScope or HuggingFace. + """ + # 如果未显式指定则交互式输入下载来源 + if model_source is None: + model_source = click.prompt( + "Please select the model download source: ", + type=click.Choice(['huggingface', 'modelscope']), + default='huggingface' + ) + + if os.getenv('MINERU_MODEL_SOURCE', None) is None: + os.environ['MINERU_MODEL_SOURCE'] = model_source + + # 如果未显式指定则交互式输入模型类型 + if model_type is None: + model_type = click.prompt( + "Please select the model type to download: ", + type=click.Choice(['pipeline', 'vlm', 'all']), + default='all' + ) + + logger.info(f"Downloading {model_type} model from {os.getenv('MINERU_MODEL_SOURCE', None)}...") + + try: + if model_type == 'pipeline': + download_pipeline_models() + elif model_type == 'vlm': + download_vlm_models() + elif model_type == 'all': + download_pipeline_models() + download_vlm_models() + else: + click.echo(f"Unsupported model type: {model_type}", err=True) + sys.exit(1) + + except Exception as e: + logger.exception(f"An error occurred while downloading models: {str(e)}") + sys.exit(1) + +if __name__ == '__main__': + download_models() diff --git a/vendor/mineru/mineru/cli/vlm_sglang_server.py b/vendor/mineru/mineru/cli/vlm_sglang_server.py new file mode 100644 index 0000000000000000000000000000000000000000..0aa7b54bd3fc9ccc7df826bcaaf72339fe03e8bf --- /dev/null +++ b/vendor/mineru/mineru/cli/vlm_sglang_server.py @@ -0,0 +1,4 @@ +from ..model.vlm_sglang_model.server import main + +if __name__ == "__main__": + main() diff --git a/vendor/mineru/mineru/data/__init__.py b/vendor/mineru/mineru/data/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/vendor/mineru/mineru/data/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/vendor/mineru/mineru/data/data_reader_writer/__init__.py b/vendor/mineru/mineru/data/data_reader_writer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..241f610b758acb0be53c8f2f40c1ec56d6e39185 --- /dev/null +++ b/vendor/mineru/mineru/data/data_reader_writer/__init__.py @@ -0,0 +1,17 @@ +from .base import DataReader, DataWriter +from .dummy import DummyDataWriter +from .filebase import FileBasedDataReader, FileBasedDataWriter +from .multi_bucket_s3 import MultiBucketS3DataReader, MultiBucketS3DataWriter +from .s3 import S3DataReader, S3DataWriter + +__all__ = [ + "DataReader", + "DataWriter", + "FileBasedDataReader", + "FileBasedDataWriter", + "S3DataReader", + "S3DataWriter", + "MultiBucketS3DataReader", + "MultiBucketS3DataWriter", + "DummyDataWriter", +] diff --git a/vendor/mineru/mineru/data/data_reader_writer/base.py b/vendor/mineru/mineru/data/data_reader_writer/base.py new file mode 100644 index 0000000000000000000000000000000000000000..d294b329559723303b1f42cb9f48c39f07ae3622 --- /dev/null +++ b/vendor/mineru/mineru/data/data_reader_writer/base.py @@ -0,0 +1,63 @@ + +from abc import ABC, abstractmethod + + +class DataReader(ABC): + + def read(self, path: str) -> bytes: + """Read the file. + + Args: + path (str): file path to read + + Returns: + bytes: the content of the file + """ + return self.read_at(path) + + @abstractmethod + def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: + """Read the file at offset and limit. + + Args: + path (str): the file path + offset (int, optional): the number of bytes skipped. Defaults to 0. + limit (int, optional): the length of bytes want to read. Defaults to -1. 
+ + Returns: + bytes: the content of the file + """ + pass + + +class DataWriter(ABC): + @abstractmethod + def write(self, path: str, data: bytes) -> None: + """Write the data to the file. + + Args: + path (str): the target file where to write + data (bytes): the data want to write + """ + pass + + def write_string(self, path: str, data: str) -> None: + """Write the data to file, the data will be encoded to bytes. + + Args: + path (str): the target file where to write + data (str): the data want to write + """ + + def safe_encode(data: str, method: str): + try: + bit_data = data.encode(encoding=method, errors='replace') + return bit_data, True + except: # noqa + return None, False + + for method in ['utf-8', 'ascii']: + bit_data, flag = safe_encode(data, method) + if flag: + self.write(path, bit_data) + break diff --git a/vendor/mineru/mineru/data/data_reader_writer/dummy.py b/vendor/mineru/mineru/data/data_reader_writer/dummy.py new file mode 100644 index 0000000000000000000000000000000000000000..dbcf09c0356accd7822c4a773f85643339919a2b --- /dev/null +++ b/vendor/mineru/mineru/data/data_reader_writer/dummy.py @@ -0,0 +1,11 @@ +from .base import DataWriter + + +class DummyDataWriter(DataWriter): + def write(self, path: str, data: bytes) -> None: + """Dummy write method that does nothing.""" + pass + + def write_string(self, path: str, data: str) -> None: + """Dummy write_string method that does nothing.""" + pass diff --git a/vendor/mineru/mineru/data/data_reader_writer/filebase.py b/vendor/mineru/mineru/data/data_reader_writer/filebase.py new file mode 100644 index 0000000000000000000000000000000000000000..67af8e630f6086598dff5dd241c512907fcbac17 --- /dev/null +++ b/vendor/mineru/mineru/data/data_reader_writer/filebase.py @@ -0,0 +1,62 @@ +import os + +from .base import DataReader, DataWriter + + +class FileBasedDataReader(DataReader): + def __init__(self, parent_dir: str = ''): + """Initialized with parent_dir. 
+ + Args: + parent_dir (str, optional): the parent directory that may be used within methods. Defaults to ''. + """ + self._parent_dir = parent_dir + + def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: + """Read at offset and limit. + + Args: + path (str): the path of file, if the path is relative path, it will be joined with parent_dir. + offset (int, optional): the number of bytes skipped. Defaults to 0. + limit (int, optional): the length of bytes want to read. Defaults to -1. + + Returns: + bytes: the content of file + """ + fn_path = path + if not os.path.isabs(fn_path) and len(self._parent_dir) > 0: + fn_path = os.path.join(self._parent_dir, path) + + with open(fn_path, 'rb') as f: + f.seek(offset) + if limit == -1: + return f.read() + else: + return f.read(limit) + + +class FileBasedDataWriter(DataWriter): + def __init__(self, parent_dir: str = '') -> None: + """Initialized with parent_dir. + + Args: + parent_dir (str, optional): the parent directory that may be used within methods. Defaults to ''. + """ + self._parent_dir = parent_dir + + def write(self, path: str, data: bytes) -> None: + """Write file with data. + + Args: + path (str): the path of file, if the path is relative path, it will be joined with parent_dir. 
class MultiS3Mixin:
    def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
        """Set up the default bucket/prefix and validate the s3 configs.

        Args:
            default_prefix (str): prefix applied to relative paths, written as
                {some_bucket}/{some_prefix} or just {some_bucket}. The first
                path segment becomes the default bucket, the rest the prefix.
            s3_configs (list[S3Config]): one config per bucket; bucket_name
                must be unique within the list.

        Raises:
            InvalidConfig: default_prefix is empty.
            InvalidConfig: the default bucket has no entry in s3_configs.
            InvalidConfig: bucket names in s3_configs are not unique.
        """
        if len(default_prefix) == 0:
            raise InvalidConfig('default_prefix must be provided')

        bucket, *prefix_parts = default_prefix.strip('/').split('/')
        self.default_bucket = bucket
        self.default_prefix = '/'.join(prefix_parts)

        bucket_names = [conf.bucket_name for conf in s3_configs]

        if self.default_bucket not in bucket_names:
            raise InvalidConfig(
                f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
            )

        if len(set(bucket_names)) != len(s3_configs):
            raise InvalidConfig(
                f'the bucket_name in s3_configs: {s3_configs} must be unique'
            )

        self.s3_configs = s3_configs
        # Per-bucket client cache; it starts empty here.
        self._s3_clients_h: dict = {}
+ """ + may_range_params = parse_s3_range_params(path) + if may_range_params is None or 2 != len(may_range_params): + byte_start, byte_len = 0, -1 + else: + byte_start, byte_len = int(may_range_params[0]), int(may_range_params[1]) + path = remove_non_official_s3_args(path) + return self.read_at(path, byte_start, byte_len) + + def __get_s3_client(self, bucket_name: str): + if bucket_name not in set([conf.bucket_name for conf in self.s3_configs]): + raise InvalidParams( + f'bucket name: {bucket_name} not found in s3_configs: {self.s3_configs}' + ) + if bucket_name not in self._s3_clients_h: + conf = next( + filter(lambda conf: conf.bucket_name == bucket_name, self.s3_configs) + ) + self._s3_clients_h[bucket_name] = S3Reader( + bucket_name, + conf.access_key, + conf.secret_key, + conf.endpoint_url, + conf.addressing_style, + ) + return self._s3_clients_h[bucket_name] + + def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: + """Read the file with offset and limit, select diffect bucket client + for each request based on the bucket. + + Args: + path (str): the file path. + offset (int, optional): the number of bytes skipped. Defaults to 0. + limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite. + + Returns: + bytes: the file content. 
+ """ + if path.startswith('s3://'): + bucket_name, path = parse_s3path(path) + s3_reader = self.__get_s3_client(bucket_name) + else: + s3_reader = self.__get_s3_client(self.default_bucket) + if self.default_prefix: + path = self.default_prefix + '/' + path + return s3_reader.read_at(path, offset, limit) + + +class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin): + def __get_s3_client(self, bucket_name: str): + if bucket_name not in set([conf.bucket_name for conf in self.s3_configs]): + raise InvalidParams( + f'bucket name: {bucket_name} not found in s3_configs: {self.s3_configs}' + ) + if bucket_name not in self._s3_clients_h: + conf = next( + filter(lambda conf: conf.bucket_name == bucket_name, self.s3_configs) + ) + self._s3_clients_h[bucket_name] = S3Writer( + bucket_name, + conf.access_key, + conf.secret_key, + conf.endpoint_url, + conf.addressing_style, + ) + return self._s3_clients_h[bucket_name] + + def write(self, path: str, data: bytes) -> None: + """Write file with data, also select diffect bucket client for each + request based on the bucket. + + Args: + path (str): the path of file, if the path is relative path, it will be joined with parent_dir. + data (bytes): the data want to write. 
+ """ + if path.startswith('s3://'): + bucket_name, path = parse_s3path(path) + s3_writer = self.__get_s3_client(bucket_name) + else: + s3_writer = self.__get_s3_client(self.default_bucket) + if self.default_prefix: + path = self.default_prefix + '/' + path + return s3_writer.write(path, data) diff --git a/vendor/mineru/mineru/data/data_reader_writer/s3.py b/vendor/mineru/mineru/data/data_reader_writer/s3.py new file mode 100644 index 0000000000000000000000000000000000000000..d96343d94f8ed094ae40c1b9c283a48a59512998 --- /dev/null +++ b/vendor/mineru/mineru/data/data_reader_writer/s3.py @@ -0,0 +1,72 @@ +from .multi_bucket_s3 import MultiBucketS3DataReader, MultiBucketS3DataWriter +from ..utils.schemas import S3Config + + +class S3DataReader(MultiBucketS3DataReader): + def __init__( + self, + default_prefix_without_bucket: str, + bucket: str, + ak: str, + sk: str, + endpoint_url: str, + addressing_style: str = 'auto', + ): + """s3 reader client. + + Args: + default_prefix_without_bucket: prefix that not contains bucket + bucket (str): bucket name + ak (str): access key + sk (str): secret key + endpoint_url (str): endpoint url of s3 + addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual' + refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html + """ + super().__init__( + f'{bucket}/{default_prefix_without_bucket}', + [ + S3Config( + bucket_name=bucket, + access_key=ak, + secret_key=sk, + endpoint_url=endpoint_url, + addressing_style=addressing_style, + ) + ], + ) + + +class S3DataWriter(MultiBucketS3DataWriter): + def __init__( + self, + default_prefix_without_bucket: str, + bucket: str, + ak: str, + sk: str, + endpoint_url: str, + addressing_style: str = 'auto', + ): + """s3 writer client. 
+ + Args: + default_prefix_without_bucket: prefix that not contains bucket + bucket (str): bucket name + ak (str): access key + sk (str): secret key + endpoint_url (str): endpoint url of s3 + addressing_style (str, optional): Defaults to 'auto'. Other valid options here are 'path' and 'virtual' + refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html + """ + super().__init__( + f'{bucket}/{default_prefix_without_bucket}', + [ + S3Config( + bucket_name=bucket, + access_key=ak, + secret_key=sk, + endpoint_url=endpoint_url, + addressing_style=addressing_style, + ) + ], + ) diff --git a/vendor/mineru/mineru/data/io/__init__.py b/vendor/mineru/mineru/data/io/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2cf63072b4da6712fc056d5f1fca256102e1c024 --- /dev/null +++ b/vendor/mineru/mineru/data/io/__init__.py @@ -0,0 +1,6 @@ + +from .base import IOReader, IOWriter +from .http import HttpReader, HttpWriter +from .s3 import S3Reader, S3Writer + +__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer'] \ No newline at end of file diff --git a/vendor/mineru/mineru/data/io/base.py b/vendor/mineru/mineru/data/io/base.py new file mode 100644 index 0000000000000000000000000000000000000000..3c163d1fe97f9f40820fbd710f85a67bcccd4b34 --- /dev/null +++ b/vendor/mineru/mineru/data/io/base.py @@ -0,0 +1,42 @@ +from abc import ABC, abstractmethod + + +class IOReader(ABC): + @abstractmethod + def read(self, path: str) -> bytes: + """Read the file. + + Args: + path (str): file path to read + + Returns: + bytes: the content of the file + """ + pass + + @abstractmethod + def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes: + """Read at offset and limit. + + Args: + path (str): the path of file, if the path is relative path, it will be joined with parent_dir. + offset (int, optional): the number of bytes skipped. Defaults to 0. + limit (int, optional): the length of bytes want to read. 
def write(self, url: str, data: bytes) -> None:
    """POST ``data`` to ``url`` as a multipart upload in a field named ``file``.

    Args:
        url (str): the endpoint that receives the upload.
        data (bytes): the data want to write

    Raises:
        AssertionError: the response status code is outside 200-299.
    """
    files = {'file': io.BytesIO(data)}
    response = requests.post(url, files=files)
    # An explicit raise (instead of an ``assert`` statement) keeps this check
    # alive under ``python -O`` while preserving the exception type callers
    # may already catch.
    if not 200 <= response.status_code < 300:
        raise AssertionError(
            f'HTTP upload to {url} failed with status {response.status_code}'
        )
def read_at(self, key: str, offset: int = 0, limit: int = -1) -> bytes:
    """Read at offset and limit.

    Args:
        key (str): object key inside the bucket this reader was created for.
        offset (int, optional): the number of bytes skipped. Defaults to 0.
        limit (int, optional): the length of bytes want to read; any negative
            value means "read to the end of the object". Defaults to -1.

    Returns:
        bytes: the content of file
    """
    if limit == 0:
        # ``bytes=N-(N-1)`` is not a valid HTTP byte range; a zero-length
        # read is simply empty, matching a local ``file.read(0)``.
        return b''

    request = {'Bucket': self._bucket, 'Key': key}
    if limit > -1:
        request['Range'] = f'bytes={offset}-{offset + limit - 1}'
    elif offset != 0:
        request['Range'] = f'bytes={offset}-'
    # Otherwise the whole object is wanted, so no Range header is sent.
    # NOTE(review): this should also let zero-byte objects be read, since a
    # range request on an empty object is expected to be rejected - confirm
    # against the target S3 implementation.

    res = self._s3_client.get_object(**request)
    return res['Body'].read()
class FileNotExisted(Exception):
    """Error carrying the path of a file that does not exist."""

    def __init__(self, path):
        # Forward the argument to Exception so ``args`` is populated; without
        # it the instance cannot be rebuilt by copy/pickle (which call
        # ``cls(*args)``) and ``repr()`` shows no detail.
        super().__init__(path)
        self.path = path

    def __str__(self):
        return f'File {self.path} does not exist.'
def remove_non_official_s3_args(s3path):
    """Drop everything from the first ``?`` onwards.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json
    """
    return s3path.split("?")[0]


def parse_s3path(s3path: str):
    """Split an S3 URI into ``(bucket_name, key)``.

    Any ``?...`` suffix and surrounding whitespace are removed first.

    Args:
        s3path (str): a path of the form ``s3://bucket-name/key`` or
            ``s3a://bucket-name/key``.

    Returns:
        tuple: ``(bucket_name, key)``.

    Raises:
        ValueError: the path starts with ``/``, lacks an ``s3://``/``s3a://``
            scheme, or has no ``/key`` part after the bucket name.
    """
    s3path = remove_non_official_s3_args(s3path).strip()
    if s3path.startswith(('s3://', 's3a://')):
        _, path = s3path.split('://', 1)
        bucket_name, separator, key = path.partition('/')
        if not separator:
            # Previously surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError(
                f"Invalid S3 path format: '{s3path}' has no key after the bucket name. "
                "Expected 's3://bucket-name/key' or 's3a://bucket-name/key'."
            )
        return bucket_name, key
    elif s3path.startswith('/'):
        raise ValueError("The provided path starts with '/'. This does not conform to a valid S3 path format.")
    else:
        raise ValueError("Invalid S3 path format. Expected 's3://bucket-name/key' or 's3a://bucket-name/key'.")


def parse_s3_range_params(s3path: str):
    """Extract the comma-separated values following ``?bytes=``.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> ['0', '81350']

    Returns:
        list[str] | None: the raw string fields, or None when the path
        contains no ``?bytes=`` marker.
    """
    arr = s3path.split("?bytes=")
    if len(arr) == 1:
        return None
    return arr[1].split(",")
diff --git a/vendor/mineru/mineru/model/layout/doclayout_yolo.py b/vendor/mineru/mineru/model/layout/doclayout_yolo.py new file mode 100644 index 0000000000000000000000000000000000000000..5667a9093f314c3319f638f0fa9b3c23650cd159 --- /dev/null +++ b/vendor/mineru/mineru/model/layout/doclayout_yolo.py @@ -0,0 +1,73 @@ +from typing import List, Dict, Union +from doclayout_yolo import YOLOv10 +from tqdm import tqdm +import numpy as np +from PIL import Image + + +class DocLayoutYOLOModel: + def __init__( + self, + weight: str, + device: str = "cuda", + imgsz: int = 1280, + conf: float = 0.1, + iou: float = 0.45, + ): + self.model = YOLOv10(weight).to(device) + self.device = device + self.imgsz = imgsz + self.conf = conf + self.iou = iou + + def _parse_prediction(self, prediction) -> List[Dict]: + layout_res = [] + + # 容错处理 + if not hasattr(prediction, "boxes") or prediction.boxes is None: + return layout_res + + for xyxy, conf, cls in zip( + prediction.boxes.xyxy.cpu(), + prediction.boxes.conf.cpu(), + prediction.boxes.cls.cpu(), + ): + coords = list(map(int, xyxy.tolist())) + xmin, ymin, xmax, ymax = coords + layout_res.append({ + "category_id": int(cls.item()), + "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax], + "score": round(float(conf.item()), 3), + }) + return layout_res + + def predict(self, image: Union[np.ndarray, Image.Image]) -> List[Dict]: + prediction = self.model.predict( + image, + imgsz=self.imgsz, + conf=self.conf, + iou=self.iou, + verbose=False + )[0] + return self._parse_prediction(prediction) + + def batch_predict( + self, + images: List[Union[np.ndarray, Image.Image]], + batch_size: int = 4 + ) -> List[List[Dict]]: + results = [] + with tqdm(total=len(images), desc="Layout Predict") as pbar: + for idx in range(0, len(images), batch_size): + batch = images[idx: idx + batch_size] + predictions = self.model.predict( + batch, + imgsz=self.imgsz, + conf=self.conf, + iou=self.iou, + verbose=False, + ) + for pred in predictions: + 
results.append(self._parse_prediction(pred)) + pbar.update(len(batch)) + return results \ No newline at end of file diff --git a/vendor/mineru/mineru/model/mfd/__init__.py b/vendor/mineru/mineru/model/mfd/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/vendor/mineru/mineru/model/mfd/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/vendor/mineru/mineru/model/mfd/yolo_v8.py b/vendor/mineru/mineru/model/mfd/yolo_v8.py new file mode 100644 index 0000000000000000000000000000000000000000..33dac0911c6f1da2e67bcb238093c360dd34770b --- /dev/null +++ b/vendor/mineru/mineru/model/mfd/yolo_v8.py @@ -0,0 +1,53 @@ +from typing import List, Union +from tqdm import tqdm +from ultralytics import YOLO +import numpy as np +from PIL import Image + + +class YOLOv8MFDModel: + def __init__( + self, + weight: str, + device: str = "cpu", + imgsz: int = 1888, + conf: float = 0.25, + iou: float = 0.45, + ): + self.model = YOLO(weight).to(device) + self.device = device + self.imgsz = imgsz + self.conf = conf + self.iou = iou + + def _run_predict( + self, + inputs: Union[np.ndarray, Image.Image, List], + is_batch: bool = False + ) -> List: + preds = self.model.predict( + inputs, + imgsz=self.imgsz, + conf=self.conf, + iou=self.iou, + verbose=False, + device=self.device + ) + return [pred.cpu() for pred in preds] if is_batch else preds[0].cpu() + + def predict(self, image: Union[np.ndarray, Image.Image]): + return self._run_predict(image) + + def batch_predict( + self, + images: List[Union[np.ndarray, Image.Image]], + batch_size: int = 4 + ) -> List: + results = [] + with tqdm(total=len(images), desc="MFD Predict") as pbar: + for idx in range(0, len(images), batch_size): + batch = images[idx: idx + batch_size] + batch_preds = self._run_predict(batch, is_batch=True) + results.extend(batch_preds) + pbar.update(len(batch)) + return results \ No newline at end of 
class MathDataset(Dataset):
    """Dataset over a sequence of items, optionally passed through a transform.

    Despite the attribute name ``image_paths``, items are handed to
    ``transform`` as-is; nothing is loaded from disk here.
    """

    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return item ``idx``, transformed when a transform is configured."""
        raw_image = self.image_paths[idx]
        if self.transform:
            return self.transform(raw_image)
        # No transform: return the raw item (previously an UnboundLocalError).
        return raw_image
2), + "latex": "", + } + formula_list.append(new_item) + bbox_img = image[ymin:ymax, xmin:xmax] + mf_image_list.append(bbox_img) + + dataset = MathDataset(mf_image_list, transform=self.model.transform) + dataloader = DataLoader(dataset, batch_size=32, num_workers=0) + mfr_res = [] + for mf_img in dataloader: + mf_img = mf_img.to(dtype=self.model.dtype) + mf_img = mf_img.to(self.device) + with torch.no_grad(): + output = self.model.generate({"image": mf_img}) + mfr_res.extend(output["fixed_str"]) + for res, latex in zip(formula_list, mfr_res): + res["latex"] = latex + return formula_list + + def batch_predict(self, images_mfd_res: list, images: list, batch_size: int = 64) -> list: + images_formula_list = [] + mf_image_list = [] + backfill_list = [] + image_info = [] # Store (area, original_index, image) tuples + + # Collect images with their original indices + for image_index in range(len(images_mfd_res)): + mfd_res = images_mfd_res[image_index] + pil_img = images[image_index] + formula_list = [] + + for idx, (xyxy, conf, cla) in enumerate(zip( + mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls + )): + xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy] + new_item = { + "category_id": 13 + int(cla.item()), + "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax], + "score": round(float(conf.item()), 2), + "latex": "", + } + formula_list.append(new_item) + bbox_img = pil_img.crop((xmin, ymin, xmax, ymax)) + area = (xmax - xmin) * (ymax - ymin) + + curr_idx = len(mf_image_list) + image_info.append((area, curr_idx, bbox_img)) + mf_image_list.append(bbox_img) + + images_formula_list.append(formula_list) + backfill_list += formula_list + + # Stable sort by area + image_info.sort(key=lambda x: x[0]) # sort by area + sorted_indices = [x[1] for x in image_info] + sorted_images = [x[2] for x in image_info] + + # Create mapping for results + index_mapping = {new_idx: old_idx for new_idx, old_idx in enumerate(sorted_indices)} + + # Create dataset with sorted 
images + dataset = MathDataset(sorted_images, transform=self.model.transform) + dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0) + + # Process batches and store results + mfr_res = [] + # for mf_img in dataloader: + + with tqdm(total=len(sorted_images), desc="MFR Predict") as pbar: + for index, mf_img in enumerate(dataloader): + mf_img = mf_img.to(dtype=self.model.dtype) + mf_img = mf_img.to(self.device) + with torch.no_grad(): + output = self.model.generate({"image": mf_img}) + mfr_res.extend(output["fixed_str"]) + + # 更新进度条,每次增加batch_size,但要注意最后一个batch可能不足batch_size + current_batch_size = min(batch_size, len(sorted_images) - index * batch_size) + pbar.update(current_batch_size) + + # Restore original order + unsorted_results = [""] * len(mfr_res) + for new_idx, latex in enumerate(mfr_res): + original_idx = index_mapping[new_idx] + unsorted_results[original_idx] = latex + + # Fill results back + for res, latex in zip(backfill_list, unsorted_results): + res["latex"] = latex + + return images_formula_list diff --git a/vendor/mineru/mineru/model/mfr/unimernet/__init__.py b/vendor/mineru/mineru/model/mfr/unimernet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/__init__.py b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..772dcfa32813a2f7befe217ee5addd3e4e6ee28a --- /dev/null +++ b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/__init__.py @@ -0,0 +1,13 @@ +from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor +from .unimer_mbart import UnimerMBartConfig, UnimerMBartModel, UnimerMBartForCausalLM +from .modeling_unimernet import UnimernetModel + +__all__ = [ + "UnimerSwinConfig", + "UnimerSwinModel", + "UnimerSwinImageProcessor", + "UnimerMBartConfig", + "UnimerMBartModel", + 
"UnimerMBartForCausalLM", + "UnimernetModel", +] diff --git a/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py new file mode 100644 index 0000000000000000000000000000000000000000..d08b60931b5876df44990c216fd2f3aca12f149e --- /dev/null +++ b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/modeling_unimernet.py @@ -0,0 +1,495 @@ +import os +import re +import warnings +from typing import Optional + +import torch +from ftfy import fix_text +from loguru import logger + +from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel +from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel +from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import logger as base_model_logger + +from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor +from .unimer_mbart import UnimerMBartConfig, UnimerMBartForCausalLM + +AutoConfig.register(UnimerSwinConfig.model_type, UnimerSwinConfig) +AutoConfig.register(UnimerMBartConfig.model_type, UnimerMBartConfig) +AutoModel.register(UnimerSwinConfig, UnimerSwinModel) +AutoModelForCausalLM.register(UnimerMBartConfig, UnimerMBartForCausalLM) + + +# TODO: rewrite tokenizer +class TokenizerWrapper: + def __init__(self, tokenizer): + self.tokenizer = tokenizer + self.pad_token_id = self.tokenizer.pad_token_id + self.bos_token_id = self.tokenizer.bos_token_id + self.eos_token_id = self.tokenizer.eos_token_id + + def __len__(self): + return len(self.tokenizer) + + def tokenize(self, text, **kwargs): + return self.tokenizer( + text, + return_token_type_ids=False, + return_tensors="pt", + padding="longest", + truncation=True, + **kwargs, + ) + + def token2str(self, tokens) -> list: + generated_text = self.tokenizer.batch_decode(tokens, skip_special_tokens=True) + generated_text = [fix_text(text) for text in 
generated_text] + return generated_text + + def detokenize(self, tokens): + toks = [self.tokenizer.convert_ids_to_tokens(tok) for tok in tokens] + for b in range(len(toks)): + for i in reversed(range(len(toks[b]))): + if toks[b][i] is None: + toks[b][i] = '' + toks[b][i] = toks[b][i].replace('Ġ', ' ').strip() + if toks[b][i] in ([self.tokenizer.bos_token, self.tokenizer.eos_token, self.tokenizer.pad_token]): + del toks[b][i] + return toks + + +LEFT_PATTERN = re.compile(r'(\\left)(\S*)') +RIGHT_PATTERN = re.compile(r'(\\right)(\S*)') +LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])') +RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])') +LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?') + +def fix_latex_left_right(s): + """ + 修复LaTeX中的\\left和\\right命令 + 1. 确保它们后面跟有效分隔符 + 2. 平衡\\left和\\right的数量 + """ + # 白名单分隔符 + valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|', + r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor', + r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow', + r'\Uparrow', r'\Downarrow', r'\|', r'\.'] + + # 为\left后缺失有效分隔符的情况添加点 + def fix_delim(match, is_left=True): + cmd = match.group(1) # \left 或 \right + rest = match.group(2) if len(match.groups()) > 1 else "" + if not rest or rest not in valid_delims_list: + return cmd + "." 
def _is_ascii_letter(char):
    """Return True when ``char`` is a single ASCII letter (a-z or A-Z)."""
    return 'a' <= char <= 'z' or 'A' <= char <= 'Z'


def _delimiter_length(text, pos):
    """
    Return the length of the delimiter token that starts at ``text[pos]``.

    A delimiter is one character (``(``, ``.``, ``|`` ...), a control symbol
    (a backslash plus one non-letter, e.g. ``\\{``) or a control word
    (a backslash plus a run of letters, e.g. ``\\rangle``).
    """
    if text[pos] != '\\' or pos + 1 >= len(text):
        return 1
    end = pos + 1
    while end < len(text) and _is_ascii_letter(text[end]):
        end += 1
    if end == pos + 1:
        # Control symbol such as \{ or \|
        return 2
    return end - pos


def _delimited_command_end(text, pos, command):
    """
    Return the index just past ``command`` and its delimiter, or -1.

    -1 is returned when ``command`` does not start at ``pos``, when nothing
    follows it, or when it is only the prefix of a longer control word
    (``\\leftarrow``, ``\\rightarrow``, ``\\lefteqn`` ...).
    """
    delim_pos = pos + len(command)
    if delim_pos >= len(text) or not text.startswith(command, pos):
        return -1
    if _is_ascii_letter(text[delim_pos]):
        return -1
    return delim_pos + _delimiter_length(text, delim_pos)


def fix_left_right_pairs(latex_formula):
    """
    Repair ``\\left`` / ``\\right`` pairs that sit at different brace depths.

    The formula is scanned once while tracking the brace nesting depth. When a
    ``\\right`` closes a ``\\left`` that was opened at a different depth, the
    ``\\right`` (together with its complete delimiter) is moved to just before
    the ``}`` that closes the group containing the ``\\left``.

    Args:
        latex_formula (str): Input LaTeX formula.

    Returns:
        str: The repaired formula; the input itself when nothing has to move.
    """
    depth = 0
    # Open \left commands: (position, brace depth at that position)
    left_stack = []
    # \right commands to relocate: (start, end, target), all in original indices
    moves = []

    length = len(latex_formula)
    i = 0
    while i < length:
        # A character preceded by an odd number of backslashes is escaped.
        if is_escaped(latex_formula, i):
            i += 1
            continue

        char = latex_formula[i]
        if char == '\\':
            end = _delimited_command_end(latex_formula, i, '\\left')
            if end != -1:
                left_stack.append((i, depth))
                i = end  # skip \left and its delimiter
                continue

            end = _delimited_command_end(latex_formula, i, '\\right')
            if end != -1:
                if left_stack:
                    left_pos, left_depth = left_stack.pop()
                    if left_depth != depth:
                        target = find_group_end(latex_formula, left_pos, left_depth)
                        if target != -1:
                            moves.append((i, end, target))
                i = end  # skip \right and its delimiter
                continue
        elif char == '{':
            depth += 1
        elif char == '}':
            if depth:
                depth -= 1

        i += 1

    if not moves:
        return latex_formula

    # Rebuild the string in one pass over the ORIGINAL indices, so that no
    # removal or insertion can invalidate the position of another move.
    removed = {}
    inserted = {}
    for start, end, target in moves:
        removed[start] = end
        # Moves are recorded innermost-first, which is the order they must
        # appear in when several land on the same closing brace.
        inserted.setdefault(target, []).append(latex_formula[start:end])

    pieces = []
    pos = 0
    while pos < length:
        pieces.extend(inserted.get(pos, ()))
        span_end = removed.get(pos)
        if span_end is None:
            pieces.append(latex_formula[pos])
            pos += 1
            continue
        # Do not lose an insertion whose target lies inside a removed span.
        for skipped in range(pos + 1, span_end):
            pieces.extend(inserted.get(skipped, ()))
        pos = span_end

    return ''.join(pieces)


def find_group_end(text, pos, depth):
    """
    Find the ``}`` that closes the brace group of nesting level ``depth``.

    Scanning starts at ``pos`` with the depth counter set to ``depth``;
    escaped braces are ignored.

    Returns:
        int: Index of the first unescaped ``}`` that brings the counter below
        ``depth``, or -1 when there is none.
    """
    current_depth = depth
    for index in range(pos, len(text)):
        char = text[index]
        if char not in '{}' or is_escaped(text, index):
            continue
        if char == '{':
            current_depth += 1
        else:
            current_depth -= 1
            if current_depth < depth:
                return index
    return -1  # no closing brace found


def is_escaped(text, pos):
    """Return True when ``text[pos]`` is preceded by an odd number of backslashes."""
    backslash_count = 0
    j = pos - 1
    while j >= 0 and text[j] == '\\':
        backslash_count += 1
        j -= 1
    return backslash_count % 2 == 1


def fix_unbalanced_braces(latex_formula):
    """
    Delete every curly brace that has no partner.

    Escaped braces (``\\{`` and ``\\}``) never take part in the matching and
    are always kept.

    Args:
        latex_formula (str): Input LaTeX formula.

    Returns:
        str: The formula without the unmatched braces.
    """
    open_positions = []  # indices of '{' still waiting for a '}'
    unmatched = set()    # indices of braces to delete

    for index, char in enumerate(latex_formula):
        if char not in '{}' or is_escaped(latex_formula, index):
            continue
        if char == '{':
            open_positions.append(index)
        elif open_positions:
            open_positions.pop()
        else:
            unmatched.add(index)

    # Whatever is still open never got closed.
    unmatched.update(open_positions)
    if not unmatched:
        return latex_formula

    return ''.join(char for index, char in enumerate(latex_formula) if index not in unmatched)
# Math environments that are commonly available in KaTeX / MathJax
ENV_TYPES = ['array', 'matrix', 'pmatrix', 'bmatrix', 'vmatrix',
             'Bmatrix', 'Vmatrix', 'cases', 'aligned', 'gathered']
ENV_BEGIN_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}') for env in ENV_TYPES}
ENV_END_PATTERNS = {env: re.compile(r'\\end\{' + env + r'\}') for env in ENV_TYPES}
ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') for env in ENV_TYPES}


def fix_latex_environments(s):
    """
    Balance the ``\\begin`` / ``\\end`` tags of every environment in ENV_TYPES.

    Environments are handled one after another on the running string:
    1. Missing ``\\begin`` tags are prepended to the formula. The brace group
       following the first existing ``\\begin{env}`` is reused as format
       argument; without one, ``array`` falls back to ``{c}`` and every other
       environment gets no argument.
    2. Missing ``\\end`` tags are appended to the formula.
    """
    for env in ENV_TYPES:
        opened = len(ENV_BEGIN_PATTERNS[env].findall(s))
        closed = len(ENV_END_PATTERNS[env].findall(s))
        if opened == closed:
            continue

        if closed < opened:
            s += (' \\end{' + env + '}') * (opened - closed)
            continue

        existing = ENV_FORMAT_PATTERNS[env].search(s)
        if existing:
            format_str = '{' + existing.group(1) + '}'
        elif env == 'array':
            format_str = '{c}'
        else:
            format_str = ''
        opener = '\\begin{' + env + '}' + format_str + ' '
        s = opener * (closed - opened) + s

    return s


UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
COMMANDS_TO_REMOVE_PATTERN = re.compile(
    r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
REPLACEMENTS_PATTERNS = {
    re.compile(r'\\underbar'): r'\\underline',
    re.compile(r'\\Bar'): r'\\hat',
    re.compile(r'\\Hat'): r'\\hat',
    re.compile(r'\\Tilde'): r'\\tilde',
    re.compile(r'\\slash'): r'/',
    re.compile(r'\\textperthousand'): r'‰',
    re.compile(r'\\sun'): r'☉',
    re.compile(r'\\textunderscore'): r'\\_',
    re.compile(r'\\fint'): r'⨏',
    re.compile(r'\\up '): r'\\ ',
    re.compile(r'\\vline = '): r'\\models ',
    re.compile(r'\\vDash '): r'\\models ',
    re.compile(r'\\sq \\sqcup '): r'\\square ',
    re.compile(r'\\copyright'): r'©',
}
QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
class UnimernetModel(VisionEncoderDecoderModel):
    """
    Vision encoder-decoder model bundled with its image processor and tokenizer.

    On top of the base model the instance carries ``self.transform`` (a
    ``UnimerSwinImageProcessor``) and ``self.tokenizer`` (a ``TokenizerWrapper``
    around the tokenizer loaded from ``config._name_or_path``). ``generate``
    returns the decoded strings together with a cleaned-up copy produced by
    ``latex_rm_whitespace``.
    """

    def __init__(
        self,
        config: Optional[PretrainedConfig] = None,
        encoder: Optional[PreTrainedModel] = None,
        decoder: Optional[PreTrainedModel] = None,
    ):
        """
        Build the base model, then attach the image processor and tokenizer.

        Raises:
            RuntimeError: If ``config`` is missing or has no ``_name_or_path``;
                that path is where the tokenizer is loaded from.
        """
        # VisionEncoderDecoderModel's checking log has bug, disable for temp.
        # The logger is re-enabled in ``finally`` even when the base constructor raises.
        base_model_logger.disabled = True
        try:
            super().__init__(config, encoder, decoder)
        finally:
            base_model_logger.disabled = False

        if not config or not hasattr(config, "_name_or_path"):
            raise RuntimeError("config._name_or_path is required by UnimernetModel.")

        model_path = config._name_or_path
        self.transform = UnimerSwinImageProcessor()
        self.tokenizer = TokenizerWrapper(AutoTokenizer.from_pretrained(model_path))
        self._post_check()

    def _post_check(self):
        """
        Check that the tokenizer and the decoder configuration agree.

        A mismatching ``model_max_length`` is overwritten with the decoder's
        ``max_position_embeddings`` after a warning. Vocabulary size, start
        token id and pad token id must match exactly.

        Raises:
            AssertionError: If one of the three id/size checks fails.
        """
        tokenizer = self.tokenizer

        if tokenizer.tokenizer.model_max_length != self.config.decoder.max_position_embeddings:
            warnings.warn(
                f"decoder.max_position_embeddings={self.config.decoder.max_position_embeddings},"
                f" but tokenizer.model_max_length={tokenizer.tokenizer.model_max_length}, will set"
                f" tokenizer.model_max_length to {self.config.decoder.max_position_embeddings}.")
            tokenizer.tokenizer.model_max_length = self.config.decoder.max_position_embeddings

        # NOTE(review): ``assert`` statements are stripped under ``python -O``.
        assert self.config.decoder.vocab_size == len(tokenizer)
        assert self.config.decoder_start_token_id == tokenizer.bos_token_id
        assert self.config.pad_token_id == tokenizer.pad_token_id

    @classmethod
    def from_checkpoint(cls, model_path: str, model_filename: str = "pytorch_model.pth", state_dict_strip_prefix="model.model."):
        """
        Build a model from ``model_path`` and load weights from a checkpoint file.

        Args:
            model_path: Directory holding the configuration and the checkpoint.
            model_filename: Checkpoint file name inside ``model_path``.
            state_dict_strip_prefix: Prefix removed from every state-dict key
                that starts with it; a falsy value disables the stripping.

        Returns:
            The model with the checkpoint weights loaded.

        Raises:
            RuntimeError: If the state dict is empty, or if the model expects
                keys that the checkpoint does not contain.
        """
        config = VisionEncoderDecoderConfig.from_pretrained(model_path)
        config._name_or_path = model_path
        # Rebuild the sub-configs as the project-specific config classes.
        config.encoder = UnimerSwinConfig(**vars(config.encoder))
        config.decoder = UnimerMBartConfig(**vars(config.decoder))

        encoder = UnimerSwinModel(config.encoder)
        decoder = UnimerMBartForCausalLM(config.decoder)
        model = cls(config, encoder, decoder)

        # load model weights
        model_file_path = os.path.join(model_path, model_filename)
        checkpoint = torch.load(model_file_path, map_location="cpu", weights_only=True)
        # The weights are either nested under the "model" key or stored at top level.
        state_dict = checkpoint["model"] if "model" in checkpoint else checkpoint
        if not state_dict:
            raise RuntimeError("state_dict is empty.")
        if state_dict_strip_prefix:
            state_dict = {
                k[len(state_dict_strip_prefix):] if k.startswith(state_dict_strip_prefix) else k: v
                for k, v in state_dict.items()
            }
        # Non-strict load: unexpected keys only warn, missing keys are fatal.
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
        if len(unexpected_keys) > 0:
            warnings.warn("Unexpected key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in unexpected_keys)))
        if len(missing_keys) > 0:
            raise RuntimeError("Missing key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in missing_keys)))
        return model

    def forward_bak(self, samples):
        """
        Compute the loss for a batch given as ``{"image": ..., "text_input": ...}``.

        NOTE(review): this calls ``self.model``, which is not assigned anywhere
        in the class as shown here; confirm the method is still usable.
        """
        pixel_values, text = samples["image"], samples["text_input"]

        text_inputs = self.tokenizer.tokenize(text).to(pixel_values.device)
        decoder_input_ids, decoder_attention_mask = text_inputs["input_ids"], text_inputs["attention_mask"]

        # Single-channel images are repeated to three channels.
        num_channels = pixel_values.shape[1]
        if num_channels == 1:
            pixel_values = pixel_values.repeat(1, 3, 1, 1)

        # ``* 1`` yields a new tensor; padding positions are set to -100 in the labels.
        labels = decoder_input_ids * 1
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        # Inputs drop the last position, labels drop the first (shift by one).
        loss = self.model(
            pixel_values=pixel_values,
            decoder_input_ids=decoder_input_ids[:, :-1],
            decoder_attention_mask=decoder_attention_mask[:, :-1],
            labels=labels[:, 1:],
        ).loss
        return {"loss": loss}

    def generate(self, samples, do_sample: bool = False, temperature: float = 0.2, top_p: float = 0.95):
        """
        Generate LaTeX for the images in ``samples["image"]``.

        Args:
            samples: Mapping with the pixel tensor under ``"image"``.
            do_sample: Forwarded to the base ``generate``.
            temperature: Forwarded only when ``do_sample`` is true.
            top_p: Forwarded only when ``do_sample`` is true.

        Returns:
            dict with ``pred_ids`` (generated ids as a NumPy array, first column
            removed), ``pred_tokens``, ``pred_str`` and ``fixed_str`` (each
            ``pred_str`` entry passed through ``latex_rm_whitespace``).
        """
        pixel_values = samples["image"]
        num_channels = pixel_values.shape[1]
        if num_channels == 1:
            pixel_values = pixel_values.repeat(1, 3, 1, 1)

        kwargs = {}
        if do_sample:
            kwargs["temperature"] = temperature
            kwargs["top_p"] = top_p

        outputs = super().generate(
            pixel_values=pixel_values,
            max_new_tokens=self.tokenizer.tokenizer.model_max_length,  # required
            decoder_start_token_id=self.tokenizer.tokenizer.bos_token_id,
            do_sample=do_sample,
            **kwargs,
        )

        # Drop the first column (presumably the decoder start token; verify).
        outputs = outputs[:, 1:].cpu().numpy()
        pred_tokens = self.tokenizer.detokenize(outputs)
        pred_str = self.tokenizer.token2str(outputs)
        fixed_str = [latex_rm_whitespace(s) for s in pred_str]
        return {"pred_ids": outputs, "pred_tokens": pred_tokens, "pred_str": pred_str, "fixed_str": fixed_str}
class UnimerMBartConfig(PretrainedConfig):
    r"""
    Configuration class for the UnimerMBART model. It stores the arguments that define the model architecture; the
    defaults follow the MBART [facebook/mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25)
    architecture, extended by the `qk_squeeze` option.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 50265):
            Vocabulary size of the model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed to the model.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers. Also stored as `num_hidden_layers`.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Forwarded unchanged to [`PretrainedConfig`].
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        qk_squeeze (`int`, *optional*, defaults to 2):
            Squeeze ratio for query/key's output dimension. See the [UniMERNet paper](https://arxiv.org/abs/2404.15254).
            Squeeze Attention maps the query and key to a lower-dimensional space without excessive loss of information,
            thereby accelerating the computation of attention.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Whether to scale the embeddings; the scale factor is sqrt(d_model) when enabled.
        pad_token_id (`int`, *optional*, defaults to 1):
            Forwarded unchanged to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 0):
            Forwarded unchanged to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            Forwarded unchanged to [`PretrainedConfig`].
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.

    Example:

    ```python
    >>> # Initializing a configuration with the default (facebook/mbart-large-cc25 style) values
    >>> configuration = UnimerMBartConfig()

    >>> # Reading a value back
    >>> configuration.qk_squeeze
    2
    ```"""

    model_type = "unimer-mbart"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Generic attribute names are aliased onto the MBART-specific ones.
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=50265,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        use_cache=True,
        is_encoder_decoder=True,
        activation_function="gelu",
        d_model=1024,
        qk_squeeze=2,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        """Store the architecture arguments; token ids and extra kwargs go to the base class."""
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.qk_squeeze = qk_squeeze
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        # ``num_hidden_layers`` mirrors the encoder depth.
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )
+# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch UnimerMBART model.""" + +import copy +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.modeling_attn_mask_utils import ( + _prepare_4d_attention_mask, + _prepare_4d_attention_mask_for_sdpa, + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from transformers import GenerationMixin, PreTrainedModel +from transformers.utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_unimer_mbart import UnimerMBartConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/mbart-large-cc25" +_CONFIG_FOR_DOC = "MBartConfig" + +# Base model docstring +_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = 
seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int): + """ + Shift input ids one token to the right, and wrap the last non pad token (the token) Note that MBart does not + have a single `decoder_start_token_id` in contrast to other Bart-like models. + """ + prev_output_tokens = input_ids.clone() + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id) + + index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1) + decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze() + prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone() + prev_output_tokens[:, 0] = decoder_start_tokens + + return prev_output_tokens + +@dataclass +class CausalLMOutputWithCrossAttentionsAndCounting(CausalLMOutputWithCrossAttentions): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
# Copied from transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding with Bart->MBart
class UnimerMBartLearnedPositionalEmbedding(nn.Embedding):
    """
    Learned positional embedding table with a fixed maximum number of positions.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int):
        # MBart shifts every position id by 2, so the table holds two extra rows.
        # Other models don't have this hack
        self.offset = 2
        table_rows = num_embeddings + self.offset
        super().__init__(table_rows, embedding_dim)

    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        """`input_ids' shape is expected to be [bsz x seqlen]."""

        batch_size, length = input_ids.shape[:2]
        first = past_key_values_length
        position_ids = torch.arange(first, first + length, dtype=torch.long, device=self.weight.device)
        position_ids = position_ids.expand(batch_size, -1)

        return super().forward(position_ids + self.offset)


# Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->MBart
class UnimerMBartScaledWordEmbedding(nn.Embedding):
    """
    Word embedding whose looked-up vectors are multiplied by a constant scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.embed_scale = embed_scale

    def forward(self, input_ids: torch.Tensor):
        looked_up = super().forward(input_ids)
        return looked_up * self.embed_scale
+ ) + + self.squeeze_dim = embed_dim // config.qk_squeeze + self.squeeze_head_dim = self.squeeze_dim // num_heads + self.scaling = self.squeeze_head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.q_proj = nn.Linear(embed_dim, self.squeeze_dim, bias=bias) + self.k_proj = nn.Linear(embed_dim, self.squeeze_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape_qk(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.squeeze_head_dim).transpose(1, 2).contiguous() + + def _shape_v(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: 
+ # cross_attentions + key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.squeeze_head_dim) + value_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape_qk(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*value_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
# Adapted from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->MBart
class UnimerMBartFlashAttention2(UnimerMBartAttention):
    """
    Flash-Attention-2 forward pass for `UnimerMBartAttention`.

    The weights are inherited untouched from the parent class; only the forward pass changes so that it calls the
    public flash-attention API and strips / restores padding tokens when a padding mask is supplied.

    Inside this class projected tensors are laid out as `(batch, seq_len, num_heads, dim)`. Queries and keys use
    `self.squeeze_head_dim` as their last dimension while values use `self.head_dim`. The key/value cache returned
    to the caller is stored with dims 1 and 2 swapped relative to that layout.
    """

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
        # flash_attn older than 2.1 generates a top-left aligned causal mask, while what is needed here is
        # bottom-right alignment, which became the default in flash_attn 2.1 and later. This attribute is used to
        # handle the difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
        # Beware that with the older versions, using q_seqlen != k_seqlen (except for the case q_seqlen == 1)
        # produces a wrong (top-left) mask.
        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

    def _shape_qk(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """View a query/key projection as `(bsz, seq_len, num_heads, squeeze_head_dim)`."""
        return tensor.view(bsz, seq_len, self.num_heads, self.squeeze_head_dim)

    def _shape_v(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """View a value projection as `(bsz, seq_len, num_heads, head_dim)`."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Run attention through the flash-attention kernels.

        Args:
            hidden_states: query-side input; unpacked as `(batch, q_len, channels)`.
            key_value_states: when given, the layer acts as cross-attention and keys/values are projected from it.
            past_key_value: cached `(key, value)` pair from a previous call, or `None`.
            attention_mask: padding mask forwarded to `_flash_attention_forward`, or `None`.
            layer_head_mask: accepted for signature compatibility; not used by this implementation.
            output_attentions: must be `False`; attention probabilities are not available on this path.

        Returns:
            `(attn_output, None, past_key_value)`. `past_key_value` is refreshed only when `self.is_decoder` is set;
            otherwise the value passed in is returned unchanged.

        Raises:
            ValueError: if `output_attentions` is truthy.
        """
        # MBartFlashAttention2 attention does not support output_attentions
        if output_attentions:
            raise ValueError("MBartFlashAttention2 attention does not support output_attentions")

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None

        bsz, q_len, _ = hidden_states.size()

        # get query proj
        query_states = self._shape_qk(self.q_proj(hidden_states), -1, bsz)

        # get key, value proj
        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
        # is checking that the `sequence_length` of the `past_key_value` is the same as
        # the provided `key_value_states` to support prefix tuning
        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            # reuse k,v, cross_attentions (undo the transpose applied when the cache was written)
            key_states = past_key_value[0].transpose(1, 2)
            value_states = past_key_value[1].transpose(1, 2)
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention: append the new positions along the sequence axis
            key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
            value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
        else:
            # self_attention
            key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
            # Further calls to cross_attention layer can then reuse all cross-attention
            # key/value_states (first "if" case)
            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
            # all previous decoder key/value_states. Further calls to uni-directional self-attention
            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
            # if encoder bi-directional self-attention `past_key_value` is always `None`
            past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons,
        # therefore the input hidden states get silently cast to float32. Hence, we need to
        # cast them back to the correct dtype just to be sure everything works as expected.
        # This might slow down training & inference so it is recommended to not cast the LayerNorms
        # in fp32.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to"
                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        # Attention dropout must only be active while training, mirroring the eager path
        # (`training=self.training`) and the SDPA path (`self.dropout if self.training else 0.0`).
        attn_dropout = self.dropout if self.training else 0.0
        attn_output = self._flash_attention_forward(
            query_states, key_states, value_states, attention_mask, q_len, dropout=attn_dropout
        )

        attn_output = attn_output.reshape(bsz, q_len, -1)
        attn_output = self.out_proj(attn_output)

        # `output_attentions=True` was rejected above, so there are never attention weights to return.
        return attn_output, None, past_key_value

    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
    def _flash_attention_forward(
        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.

        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            query_length (`int`):
                Length of the query sequence; used to re-pad the output and for the causal-mask workaround below.
            dropout (`float`):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
        """
        if not self._flash_attn_uses_top_left_mask:
            causal = self.is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details,
            # please see the comment in `__init__`.
            causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]

            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
            )

        return attn_output

    # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        """
        Drop padded positions from query/key/value ahead of the variable-length flash-attention call.

        Returns:
            `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
        # Values are shaped with `self.head_dim` (see `_shape_v`) while keys use `self.squeeze_head_dim`, so the
        # value tensor has to be flattened with its own last dimension rather than the key's.
        value_head_dim = value_layer.shape[-1]

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, value_head_dim), indices_k
        )
        if query_length == kv_seq_len:
            # Queries share the key head dimension (both come from `_shape_qk`).
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
`model.config._attn_implementation = "manual"` once this is implemented. + logger.warning( + "BartModel is using BartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention" + ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + key_value_states=key_value_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + 
else: + # self_attention + key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + query_states = self._shape_qk(query_states, tgt_len, bsz) + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, + # but we are fine here as `_shape` do call `.contiguous()`. 
class UnimerMBartEncoderLayer(nn.Module):
    """
    Single encoder block: layer-normed self-attention followed by a layer-normed two-layer feed-forward network,
    each wrapped in dropout and a residual connection.
    """

    def __init__(self, config: UnimerMBartConfig):
        super().__init__()
        d_model = config.d_model
        self.embed_dim = d_model

        # Sub-modules are created in the same order as before so parameter registration is unchanged.
        attention_cls = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(
            embed_dim=d_model,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(d_model)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(d_model, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, d_model)
        self.final_layer_norm = nn.LayerNorm(d_model)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_head_mask: torch.Tensor,
        output_attentions: bool = False,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.

        Returns:
            A tuple whose first element is the updated hidden states; the attention weights produced by
            `self.self_attn` are appended as a second element when `output_attentions` is truthy.
        """
        # Self-attention sub-block: norm -> attention -> dropout -> residual add.
        normed = self.self_attn_layer_norm(hidden_states)
        attn_out, attn_weights, _ = self.self_attn(
            hidden_states=normed,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        attn_out = nn.functional.dropout(attn_out, p=self.dropout, training=self.training)
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-block: norm -> fc1 -> activation -> dropout -> fc2 -> dropout -> residual add.
        ffn_out = self.final_layer_norm(hidden_states)
        ffn_out = self.fc1(ffn_out)
        ffn_out = self.activation_fn(ffn_out)
        ffn_out = nn.functional.dropout(ffn_out, p=self.activation_dropout, training=self.training)
        ffn_out = self.fc2(ffn_out)
        ffn_out = nn.functional.dropout(ffn_out, p=self.dropout, training=self.training)
        hidden_states = hidden_states + ffn_out

        # In half precision, clamp to just inside the representable range once an inf or NaN has appeared.
        if hidden_states.dtype == torch.float16:
            has_non_finite = torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
            if has_non_finite:
                limit = torch.finfo(hidden_states.dtype).max - 1000
                hidden_states = torch.clamp(hidden_states, min=-limit, max=limit)

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + is_causal=True, + config=config, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation]( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + config=config, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. 
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + 
class UnimerMBartPreTrainedModel(PreTrainedModel):
    """
    Base class for the UnimerMBart models: declares the config class and loader capability flags, and provides
    weight initialisation plus a small set of dummy inputs.
    """

    config_class = UnimerMBartConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    # Entries must be the class names of the modules defined in this file; the previous values
    # ("MBartDecoderLayer", "MBartSqueezeAttention") did not correspond to any class here, so the
    # no-split hint never matched a module.
    _no_split_modules = [
        "UnimerMBartDecoderLayer",
        "UnimerMBartAttention",
        "UnimerMBartFlashAttention2",
        "UnimerMBartSdpaAttention",
    ]
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """
        Initialise `module` in place.

        `nn.Linear` and `nn.Embedding` weights are drawn from a normal distribution with mean 0 and standard
        deviation `config.init_std`; linear biases and the embedding row at `padding_idx` (when set) are zeroed.
        Any other module type is left untouched.
        """
        std = self.config.init_std
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @property
    def dummy_inputs(self):
        """
        Return a dict with a fixed 2 x 5 `input_ids` batch on `self.device` (the last token of the second row is
        the pad token) and the matching `attention_mask`, which is `True` wherever the id is not the pad token.
        """
        pad_token = self.config.pad_token_id
        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
        return {
            "attention_mask": input_ids.ne(pad_token),
            "input_ids": input_ids,
        }
+""" + +MBART_GENERATION_EXAMPLE = r""" + Translation example: + + ```python + >>> from transformers import AutoTokenizer, MBartForConditionalGeneration + + >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-en-ro") + + >>> example_english_phrase = "42 is the answer" + >>> inputs = tokenizer(example_english_phrase, return_tensors="pt") + + >>> # Translate + >>> generated_ids = model.generate(**inputs, num_beams=4, max_length=5) + >>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + '42 este răspuns' + ``` + + Mask filling example: + + ```python + >>> from transformers import AutoTokenizer, MBartForConditionalGeneration + + >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25") + + >>> # de_DE is the language symbol id for German + >>> TXT = " Meine Freunde sind nett aber sie essen zu viel Kuchen. de_DE" + + >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt")["input_ids"] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + ['nett', 'sehr', 'ganz', 'nicht', 'so'] + ``` +""" + +MBART_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + MBart uses a specific language id token as the starting token for `decoder_input_ids` generation that + varies according to source and target language, *e.g.* 25004 for *en_XX*, and 25003 for *de_DE*. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
class UnimerMBartEncoder(UnimerMBartPreTrainedModel):
    """
    Encoder stack made of *config.encoder_layers* [`UnimerMBartEncoderLayer`] blocks, fed by scaled token
    embeddings plus learned positional embeddings and closed by a final layer norm.

    Args:
        config: UnimerMBartConfig
        embed_tokens (nn.Embedding, *optional*): embedding whose weight tensor is shared with this encoder's
            token embedding when provided
    """

    def __init__(self, config: UnimerMBartConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)

        hidden_dim = config.d_model
        token_scale = math.sqrt(hidden_dim) if config.scale_embedding else 1.0

        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings

        # Sub-modules are created in the same order as before so parameter ordering is unchanged.
        self.embed_tokens = UnimerMBartScaledWordEmbedding(
            config.vocab_size, hidden_dim, self.padding_idx, embed_scale=token_scale
        )
        if embed_tokens is not None:
            # Share the weight tensor with the embedding handed in by the caller.
            self.embed_tokens.weight = embed_tokens.weight

        self.embed_positions = UnimerMBartLearnedPositionalEmbedding(config.max_position_embeddings, hidden_dim)
        encoder_blocks = [UnimerMBartEncoderLayer(config) for _ in range(config.encoder_layers)]
        self.layers = nn.ModuleList(encoder_blocks)

        attn_impl = config._attn_implementation
        self._use_flash_attention_2 = attn_impl == "flash_attention_2"
        self._use_sdpa = attn_impl == "sdpa"

        self.layernorm_embedding = nn.LayerNorm(hidden_dim)
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def _backward_compatibility_gradient_checkpointing(self):
        # Overridden so the `gradient_checkpointing` attribute is left on the config.
        wants_checkpointing = getattr(self.config, "gradient_checkpointing", False)
        if self.supports_gradient_checkpointing and wants_checkpointing:
            self.gradient_checkpointing_enable()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Run the encoder stack.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Vocabulary indices of the input tokens. Mutually exclusive with `inputs_embeds`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Padding mask with values in `[0, 1]`: 1 for tokens that are **not masked**, 0 for tokens that
                are **masked**.
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Per-layer head mask with values in `[0, 1]`: 1 keeps a head, 0 nullifies it. Its first
                dimension must equal the number of encoder layers.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Pre-computed embeddings used instead of looking up `input_ids`.
            output_attentions (`bool`, *optional*):
                Whether to also return the attention tensors of every layer. Defaults to the config value.
            output_hidden_states (`bool`, *optional*):
                Whether to also return the hidden states of every layer. Defaults to the config value.
            return_dict (`bool`, *optional*):
                Whether to return a [`~utils.ModelOutput`] instead of a plain tuple. Defaults to the config
                value.

        Raises:
            ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, or if `head_mask` does
                not have one row per encoder layer.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # Validate the two mutually exclusive input forms and pick the tensor used for position lookup.
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if input_ids is not None:
            position_source = input_ids
            input_ids = input_ids.view(-1, position_source.shape[-1])
        else:
            position_source = inputs_embeds[:, :, -1]

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        position_embeddings = self.embed_positions(position_source)

        hidden_states = self.layernorm_embedding(inputs_embeds + position_embeddings.to(inputs_embeds.device))
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # Convert the 2D padding mask into whatever form the active attention backend consumes.
        if attention_mask is not None:
            if self._use_flash_attention_2:
                # Flash attention takes the 2D mask, and only when something is actually masked.
                attention_mask = attention_mask if 0 in attention_mask else None
            elif self._use_sdpa and head_mask is None and not output_attentions:
                # SDPA cannot serve `output_attentions=True` or `head_mask`; those fall through to the
                # manual implementation below.
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # A head mask, when given, needs exactly one row per layer.
        num_layers = len(self.layers)
        if head_mask is not None and head_mask.size()[0] != num_layers:
            raise ValueError(
                f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                f" {head_mask.size()[0]}."
            )

        for layer_index, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            layer_head_mask = head_mask[layer_index] if head_mask is not None else None

            # LayerDrop (see https://arxiv.org/abs/1909.11556): randomly skip layers while training.
            skip_layer = False
            if self.training:
                skip_layer = bool(torch.rand([]) < self.layerdrop)

            if skip_layer:
                layer_outputs = (None, None)
            else:
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.__call__,
                        hidden_states,
                        attention_mask,
                        layer_head_mask,
                        output_attentions,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask,
                        layer_head_mask=layer_head_mask,
                        output_attentions=output_attentions,
                    )
                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
            )
        return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
class UnimerMBartDecoder(UnimerMBartPreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a
    [`UnimerMBartDecoderLayer`].

    Token embeddings are scaled by `sqrt(d_model)` when `config.scale_embedding` is set, summed with learned
    positional embeddings, optionally shifted by a counting-context term (see `count_pred` in `forward`),
    layer-normalised, passed through the layer stack and layer-normalised again.

    Args:
        config: UnimerMBartConfig
        embed_tokens (nn.Embedding, *optional*): embedding whose weight tensor is shared with the decoder's
            token embedding when provided
    """

    def __init__(self, config: UnimerMBartConfig, embed_tokens: Optional[nn.Embedding] = None):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

        self.embed_tokens = UnimerMBartScaledWordEmbedding(
            config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
        )

        if embed_tokens is not None:
            # Share the weight tensor with the embedding handed in by the caller.
            self.embed_tokens.weight = embed_tokens.weight

        self.embed_positions = UnimerMBartLearnedPositionalEmbedding(
            config.max_position_embeddings,
            config.d_model,
        )
        self.layers = nn.ModuleList([UnimerMBartDecoderLayer(config) for _ in range(config.decoder_layers)])
        # Attention backend flags; they decide which mask format `forward` prepares.
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self._use_sdpa = config._attn_implementation == "sdpa"
        self.layernorm_embedding = nn.LayerNorm(config.d_model)
        self.layer_norm = nn.LayerNorm(config.d_model)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the decoder's token embedding module."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding module with `value`."""
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        count_pred: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            count_pred (`torch.FloatTensor`, *optional*):
                When given, it is passed through `self.counting_context_weight`, the result gets a new axis at
                dimension 1, is scaled by 0.5 and added to the embedded inputs before the embedding layer norm.
                The expected shape is not visible in this class; it has to be whatever
                `counting_context_weight` accepts.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            use_cache (`bool`, *optional*):
                Whether to collect and return each layer's key/value states. Defaults to `config.use_cache`; it is
                forced to `False` when gradient checkpointing is active during training.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Raises:
            ValueError: if both or neither of `input_ids` / `inputs_embeds` are given, or if `head_mask` /
                `cross_attn_head_mask` does not have one row per decoder layer.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # retrieve input_ids and inputs_embeds
        # `input` is the tensor handed to the positional embedding below; `input_shape` is (batch, seq_len).
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
        elif input_ids is not None:
            input = input_ids
            input_shape = input.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            input = inputs_embeds[:, :, -1]
        else:
            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")

        # past_key_values_length
        # Read from dimension 2 of the first cached tensor of the first layer.
        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self._use_flash_attention_2:
            # 2d mask is passed through the layers
            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
        elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
            # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
            # the manual implementation that requires a 4D causal mask in all cases.
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask,
                input_shape,
                inputs_embeds,
                past_key_values_length,
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask, input_shape, inputs_embeds, past_key_values_length
            )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            if self._use_flash_attention_2:
                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
            elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions:
                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
                # the manual implementation that requires a 4D causal mask in all cases.
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask,
                    inputs_embeds.dtype,
                    tgt_len=input_shape[-1],
                )
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                )

        # embed positions
        positions = self.embed_positions(input, past_key_values_length)

        hidden_states = inputs_embeds + positions.to(inputs_embeds.device)

        # TODO: add counting context weight to hidden_states
        # NOTE(review): `counting_context_weight` is not created in this class's `__init__`; passing
        # `count_pred` raises AttributeError unless the attribute is attached elsewhere -- confirm.
        if count_pred is not None:
            count_context_weight = self.counting_context_weight(count_pred)
            # `unsqueeze(1)` adds an axis at dimension 1 so the term broadcasts across it.
            hidden_states = hidden_states + 0.5 * count_context_weight.unsqueeze(1)

        hidden_states = self.layernorm_embedding(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )
        for idx, decoder_layer in enumerate(self.layers):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    # A dropped layer contributes nothing: no cache entry and no attention entries are
                    # appended for it.
                    continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                # Arguments are positional here; the literal `None` sits where the non-checkpointed call
                # passes `past_key_value`.
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache entry is read from index 3 when attentions are returned, otherwise from index 1.
                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.layer_norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            # Tuple form drops every entry that is None, so positions depend on the requested outputs.
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )
@add_start_docstrings(
    "The bare MBART Model outputting raw hidden-states without any specific head on top.",
    MBART_START_DOCSTRING,
)
class UnimerMBartModel(UnimerMBartPreTrainedModel):
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: UnimerMBartConfig):
        super().__init__(config)

        # One embedding table is handed to both the encoder and the decoder.
        self.shared = nn.Embedding(config.vocab_size, config.d_model, config.pad_token_id)

        self.encoder = UnimerMBartEncoder(config, self.shared)
        self.decoder = UnimerMBartDecoder(config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the shared token embedding."""
        return self.shared

    def set_input_embeddings(self, value):
        """Install `value` as the shared embedding and as both sub-models' token embedding."""
        self.shared = value
        self.encoder.embed_tokens = value
        self.decoder.embed_tokens = value

    def get_encoder(self):
        """Return the encoder sub-model."""
        return self.encoder

    def get_decoder(self):
        """Return the decoder sub-model."""
        return self.decoder

    def _tie_weights(self):
        # Only tie when the config asks for it; encoder first, then decoder.
        if not self.config.tie_word_embeddings:
            return
        for submodel in (self.encoder, self.decoder):
            self._tie_or_clone_weights(submodel.embed_tokens, self.get_input_embeddings())

    @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Seq2SeqModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Seq2SeqModelOutput, Tuple[torch.FloatTensor]]:
        # Unset flags fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if use_cache is None:
            use_cache = self.config.use_cache
        if return_dict is None:
            return_dict = self.config.use_return_dict

        # different to other models, MBart automatically creates decoder_input_ids from
        # input_ids if no decoder_input_ids are provided
        no_decoder_input = decoder_input_ids is None and decoder_inputs_embeds is None
        if no_decoder_input:
            decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)

        if encoder_outputs is None:
            # Nothing was pre-computed by the caller: run the encoder now.
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            # A plain tuple was supplied; wrap it so attribute access works when building the result.
            provided = len(encoder_outputs)
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if provided > 1 else None,
                attentions=encoder_outputs[2] if provided > 2 else None,
            )

        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if return_dict:
            return Seq2SeqModelOutput(
                last_hidden_state=decoder_outputs.last_hidden_state,
                past_key_values=decoder_outputs.past_key_values,
                decoder_hidden_states=decoder_outputs.hidden_states,
                decoder_attentions=decoder_outputs.attentions,
                cross_attentions=decoder_outputs.cross_attentions,
                encoder_last_hidden_state=encoder_outputs.last_hidden_state,
                encoder_hidden_states=encoder_outputs.hidden_states,
                encoder_attentions=encoder_outputs.attentions,
            )

        # Tuple form: decoder entries followed by encoder entries.
        return decoder_outputs + encoder_outputs
@add_start_docstrings(
    "The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models.",
    MBART_START_DOCSTRING,
)
class UnimerMBartForConditionalGeneration(UnimerMBartPreTrainedModel, GenerationMixin):
    base_model_prefix = "model"
    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: UnimerMBartConfig):
        super().__init__(config)
        self.model = UnimerMBartModel(config)
        # Additive bias on the logits, one entry per row of the shared embedding; stored as a buffer.
        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        """Return the encoder of the wrapped model."""
        return self.model.get_encoder()

    def get_decoder(self):
        """Return the decoder of the wrapped model."""
        return self.model.get_decoder()

    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
        """Resize the token embeddings and keep `final_logits_bias` the same width as the new table."""
        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        # Use the actual row count of the returned embedding (it may differ from `new_num_tokens` when
        # `pad_to_multiple_of` is given).
        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
        return new_embeddings

    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
        """
        Truncate or zero-pad `final_logits_bias` along its last dimension to `new_num_tokens` entries.

        The padding is created with the buffer's own dtype and device, so growing a half-precision bias does
        not promote the buffer to float32 through `torch.cat`.
        """
        old_num_tokens = self.final_logits_bias.shape[-1]
        if new_num_tokens <= old_num_tokens:
            new_bias = self.final_logits_bias[:, :new_num_tokens]
        else:
            extra_bias = torch.zeros(
                (1, new_num_tokens - old_num_tokens),
                dtype=self.final_logits_bias.dtype,
                device=self.final_logits_bias.device,
            )
            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
        self.register_buffer("final_logits_bias", new_bias)

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    @add_end_docstrings(MBART_GENERATION_EXAMPLE)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            # Training path: caching is switched off, and decoder inputs default to the shifted labels.
            if use_cache:
                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
            use_cache = False
            if decoder_input_ids is None and decoder_inputs_embeds is None:
                decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            encoder_outputs=encoder_outputs,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Project the decoder's last hidden state onto the vocabulary and add the logits bias.
        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            # Tuple form: (loss?, logits, *remaining model outputs).
            output = (lm_logits,) + outputs[1:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return Seq2SeqLMOutput(
            loss=masked_lm_loss,
            logits=lm_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    def prepare_inputs_for_generation(
        self,
        decoder_input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        """
        Build the keyword arguments for one generation step.

        When a cache is supplied, `decoder_input_ids` is cut down to the tokens the cache does not cover yet.
        `input_ids` is always returned as `None` because `encoder_outputs` is passed instead.
        """
        # cut decoder_input_ids if past is used
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if decoder_input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = decoder_input_ids.shape[1] - 1

            decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]

        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past_key_values,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """Return decoder input ids obtained by shifting `labels` to the right."""
        return shift_tokens_right(labels, self.config.pad_token_id)

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Reorder each layer's first two cached tensors along dim 0 by `beam_idx`; the rest are kept as-is."""
        reordered_past = ()
        for layer_past in past_key_values:
            # cached cross_attention states don't have to be reordered -> they are always the same
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
                + layer_past[2:],
            )
        return reordered_past
@add_start_docstrings(
    """
    MBart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    """,
    MBART_START_DOCSTRING,
)
class UnimerMBartForSequenceClassification(UnimerMBartPreTrainedModel):
    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]

    def __init__(self, config: UnimerMBartConfig, **kwargs):
        super().__init__(config, **kwargs)
        self.model = UnimerMBartModel(config)
        # Head arguments, in order: input dim, inner dim, number of classes, dropout.
        self.classification_head = UnimerMBartClassificationHead(
            config.d_model, config.d_model, config.num_labels, config.classifier_dropout
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=Seq2SeqSequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    # Adapted from transformers.models.bart.modeling_bart.BartForSequenceClassification.forward
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict
        if labels is not None:
            # No caching when a loss is being computed.
            use_cache = False

        # The EOS-based pooling below needs token ids, so an embeddings-only call is rejected.
        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]  # last hidden state

        # Pool each sequence by taking the hidden state at its last EOS position.
        eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)
        eos_per_example = eos_mask.sum(1)
        if len(torch.unique_consecutive(eos_per_example)) > 1:
            raise ValueError("All examples must have the same number of tokens.")

        batch_size = hidden_states.size(0)
        hidden_size = hidden_states.size(-1)
        eos_states = hidden_states[eos_mask, :].view(batch_size, -1, hidden_size)
        sentence_representation = eos_states[:, -1, :]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            num_labels = self.config.num_labels

            # Infer the problem type once and remember it on the config.
            if self.config.problem_type is None:
                if num_labels == 1:
                    self.config.problem_type = "regression"
                elif num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                mse = MSELoss()
                if num_labels == 1:
                    loss = mse(logits.squeeze(), labels.squeeze())
                else:
                    loss = mse(logits, labels)
            elif problem_type == "single_label_classification":
                loss = CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                loss = BCEWithLogitsLoss()(logits, labels)

        if return_dict:
            return Seq2SeqSequenceClassifierOutput(
                loss=loss,
                logits=logits,
                past_key_values=outputs.past_key_values,
                decoder_hidden_states=outputs.decoder_hidden_states,
                decoder_attentions=outputs.decoder_attentions,
                cross_attentions=outputs.cross_attentions,
                encoder_last_hidden_state=outputs.encoder_last_hidden_state,
                encoder_hidden_states=outputs.encoder_hidden_states,
                encoder_attentions=outputs.encoder_attentions,
            )

        # Tuple form: (loss?, logits, *remaining model outputs).
        output = (logits,) + outputs[1:]
        if loss is None:
            return output
        return (loss,) + output
+ """, + MBART_START_DOCSTRING, +) +class UnimerMBartForQuestionAnswering(UnimerMBartPreTrainedModel): + _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"] + + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = UnimerMBartModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.bart.modeling_bart.BartForQuestionAnswering.forward + def forward( + self, + input_ids: torch.Tensor = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (*sequence_length*). 
Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = 
loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +# Copied from transformers.models.bart.modeling_bart.BartDecoderWrapper with Bart->MBart +class UnimerMBartDecoderWrapper(UnimerMBartPreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. 
+ """ + + def __init__(self, config): + super().__init__(config) + self.decoder = UnimerMBartDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->MBart, facebook/bart-base->facebook/mbart-large-cc25 +class UnimerMBartForCausalLM(UnimerMBartPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = UnimerMBartDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentionsAndCounting, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = 
None, + count_gt: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional + tensors are only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, MBartForCausalLM + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25") + >>> model = MBartForCausalLM.from_pretrained("facebook/mbart-large-cc25", add_cross_attention=False) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size] + >>> list(logits.shape) == expected_shape + True + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + count_pred = None + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + count_pred=count_pred, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is 
not None else output + + return CausalLMOutputWithCrossAttentionsAndCounting( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + counting=count_pred, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + if past_key_values: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + # first step, decoder_cached_states are empty + return { + "input_ids": input_ids, # encoder_outputs is defined. 
input_ids not needed + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "use_cache": use_cache, + } + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past diff --git a/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b91b3be3580e7b484deb4deae3dfe880e477906 --- /dev/null +++ b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py @@ -0,0 +1,9 @@ +from .configuration_unimer_swin import UnimerSwinConfig +from .modeling_unimer_swin import UnimerSwinModel +from .image_processing_unimer_swin import UnimerSwinImageProcessor + +__all__ = [ + "UnimerSwinConfig", + "UnimerSwinModel", + "UnimerSwinImageProcessor", +] diff --git a/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py new file mode 100644 index 0000000000000000000000000000000000000000..6c577e7c98dc6a9813af7c56ba15f78232283679 --- /dev/null +++ b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Donut Swin Transformer model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + + +class UnimerSwinConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`UnimerSwinModel`]. It is used to instantiate a + Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Donut + [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 4): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embed_dim (`int`, *optional*, defaults to 96): + Dimensionality of patch embedding. + depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`): + Depth of each layer in the Transformer encoder. + num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`): + Number of attention heads in each layer of the Transformer encoder. 
+ window_size (`int`, *optional*, defaults to 7): + Size of windows. + mlp_ratio (`float`, *optional*, defaults to 4.0): + Ratio of MLP hidden dimensionality to embedding dimensionality. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether or not a learnable bias should be added to the queries, keys and values. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + drop_path_rate (`float`, *optional*, defaults to 0.1): + Stochastic depth rate. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`, + `"selu"` and `"gelu_new"` are supported. + use_absolute_embeddings (`bool`, *optional*, defaults to `False`): + Whether or not to add absolute position embeddings to the patch embeddings. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. 
+ + Example: + + ```python + >>> from transformers import UnimerSwinConfig, UnimerSwinModel + + >>> # Initializing a Donut naver-clova-ix/donut-base style configuration + >>> configuration = UnimerSwinConfig() + + >>> # Randomly initializing a model from the naver-clova-ix/donut-base style configuration + >>> model = UnimerSwinModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "unimer-swin" + + attribute_map = { + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + } + + def __init__( + self, + image_size=224, + patch_size=4, + num_channels=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + use_absolute_embeddings=False, + initializer_range=0.02, + layer_norm_eps=1e-5, + **kwargs, + ): + super().__init__(**kwargs) + + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_layers = len(depths) + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.use_absolute_embeddings = use_absolute_embeddings + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel + # this indicates the channel dimension after the last stage of the model + self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) diff --git a/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py 
b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py new file mode 100644 index 0000000000000000000000000000000000000000..98d1deeebcad8667e3cf86758eb3405a426d2c7e --- /dev/null +++ b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py @@ -0,0 +1,174 @@ +from PIL import Image, ImageOps +from transformers.image_processing_utils import BaseImageProcessor +import numpy as np +import cv2 +import albumentations as alb +from albumentations.pytorch import ToTensorV2 +from torchvision.transforms.functional import resize + + +# TODO: dereference cv2 if possible +class UnimerSwinImageProcessor(BaseImageProcessor): + def __init__( + self, + image_size = (192, 672), + ): + self.input_size = [int(_) for _ in image_size] + assert len(self.input_size) == 2 + + self.transform = alb.Compose( + [ + alb.ToGray(), + alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)), + # alb.Sharpen() + ToTensorV2(), + ] + ) + + def __call__(self, item): + image = self.prepare_input(item) + return self.transform(image=image)['image'][:1] + + @staticmethod + def crop_margin(img: Image.Image) -> Image.Image: + data = np.array(img.convert("L")) + data = data.astype(np.uint8) + max_val = data.max() + min_val = data.min() + if max_val == min_val: + return img + data = (data - min_val) / (max_val - min_val) * 255 + gray = 255 * (data < 200).astype(np.uint8) + + coords = cv2.findNonZero(gray) # Find all non-zero points (text) + a, b, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box + return img.crop((a, b, w + a, h + b)) + + @staticmethod + def crop_margin_numpy(img: np.ndarray) -> np.ndarray: + """Crop margins of image using NumPy operations""" + # Convert to grayscale if it's a color image + if len(img.shape) == 3 and img.shape[2] == 3: + gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + else: + gray = img.copy() + + # Normalize and threshold + if gray.max() == gray.min(): + return 
img + + normalized = (((gray - gray.min()) / (gray.max() - gray.min())) * 255).astype(np.uint8) + binary = 255 * (normalized < 200).astype(np.uint8) + + # Find bounding box + coords = cv2.findNonZero(binary) # Find all non-zero points (text) + x, y, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box + + # Return cropped image + return img[y:y + h, x:x + w] + + def prepare_input(self, img, random_padding: bool = False): + """ + Convert PIL Image or numpy array to properly sized and padded image after: + - crop margins + - resize while maintaining aspect ratio + - pad to target size + """ + if img is None: + return None + + # Handle numpy array + elif isinstance(img, np.ndarray): + try: + img = self.crop_margin_numpy(img) + except Exception: + # might throw an error for broken files + return None + + if img.shape[0] == 0 or img.shape[1] == 0: + return None + + # Get current dimensions + h, w = img.shape[:2] + target_h, target_w = self.input_size + + # Calculate scale to preserve aspect ratio (equivalent to resize + thumbnail) + scale = min(target_h / h, target_w / w) + + # Calculate new dimensions + new_h, new_w = int(h * scale), int(w * scale) + + # Resize the image while preserving aspect ratio + resized_img = cv2.resize(img, (new_w, new_h)) + + # Calculate padding values using the existing method + delta_width = target_w - new_w + delta_height = target_h - new_h + + pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding) + + # Apply padding (convert PIL padding format to OpenCV format) + padding_color = [0, 0, 0] if len(img.shape) == 3 else [0] + + padded_img = cv2.copyMakeBorder( + resized_img, + pad_height, # top + delta_height - pad_height, # bottom + pad_width, # left + delta_width - pad_width, # right + cv2.BORDER_CONSTANT, + value=padding_color + ) + + return padded_img + + # Handle PIL Image + elif isinstance(img, Image.Image): + try: + img = self.crop_margin(img.convert("RGB")) + except OSError: + # might throw an 
error for broken files + return None + + if img.height == 0 or img.width == 0: + return None + + # Resize while preserving aspect ratio + img = resize(img, min(self.input_size)) + img.thumbnail((self.input_size[1], self.input_size[0])) + new_w, new_h = img.width, img.height + + # Calculate and apply padding + padding = self._calculate_padding(new_w, new_h, random_padding) + return np.array(ImageOps.expand(img, padding)) + + else: + return None + + def _calculate_padding(self, new_w, new_h, random_padding): + """Calculate padding values for PIL images""" + delta_width = self.input_size[1] - new_w + delta_height = self.input_size[0] - new_h + + pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding) + + return ( + pad_width, + pad_height, + delta_width - pad_width, + delta_height - pad_height, + ) + + def _get_padding_values(self, new_w, new_h, random_padding): + """Get padding values based on image dimensions and padding strategy""" + delta_width = self.input_size[1] - new_w + delta_height = self.input_size[0] - new_h + + if random_padding: + pad_width = np.random.randint(low=0, high=delta_width + 1) + pad_height = np.random.randint(low=0, high=delta_height + 1) + else: + pad_width = delta_width // 2 + pad_height = delta_height // 2 + + return pad_width, pad_height diff --git a/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py new file mode 100644 index 0000000000000000000000000000000000000000..1b808e8bdc2b2c760598ca5d0dbd2705e42f1072 --- /dev/null +++ b/vendor/mineru/mineru/model/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py @@ -0,0 +1,1084 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch UnimerSwin Transformer model. + +This implementation is identical to a regular Swin Transformer, without final layer norm on top of the final hidden +states.""" + +import collections.abc +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from transformers.activations import ACT2FN +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer +from transformers.utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + torch_int, +) +from .configuration_unimer_swin import UnimerSwinConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "UnimerSwinConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "https://huggingface.co/naver-clova-ix/donut-base" +_EXPECTED_OUTPUT_SHAPE = [1, 49, 768] + + +@dataclass +# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->UnimerSwin +class UnimerSwinEncoderOutput(ModelOutput): + """ + UnimerSwin encoder's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->UnimerSwin +class UnimerSwinModelOutput(ModelOutput): + """ + UnimerSwin model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed): + Average pooling of the last layer hidden-state. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of + shape `(batch_size, hidden_size, height, width)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to + include the spatial dimensions. + """ + + last_hidden_state: torch.FloatTensor = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + + +# Copied from transformers.models.swin.modeling_swin.window_partition +def window_partition(input_feature, window_size): + """ + Partitions the given input into windows. 
+ """ + batch_size, height, width, num_channels = input_feature.shape + input_feature = input_feature.view( + batch_size, height // window_size, window_size, width // window_size, window_size, num_channels + ) + windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) + return windows + + +# Copied from transformers.models.swin.modeling_swin.window_reverse +def window_reverse(windows, window_size, height, width): + """ + Merges windows to produce higher resolution features. + """ + num_channels = windows.shape[-1] + windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels) + windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels) + return windows + + +# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->UnimerSwin +class UnimerSwinEmbeddings(nn.Module): + """ + Construct the patch and position embeddings. Optionally, also the mask token. + """ + + def __init__(self, config, use_mask_token=False): + super().__init__() + + self.patch_embeddings = UnimerSwinPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.patch_grid = self.patch_embeddings.grid_size + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None + + if config.use_absolute_embeddings: + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim)) + else: + self.position_embeddings = None + + ### code added. 
### + if config.use_2d_embeddings: + self.row_embeddings = nn.Parameter(torch.zeros(1, self.patch_grid[0] + 1, config.embed_dim)) + self.column_embeddings = nn.Parameter(torch.zeros(1, self.patch_grid[1] + 1, config.embed_dim)) + else: + self.row_embeddings = None + self.column_embeddings = None + ###### + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher + resolution images. + + Source: + https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + if num_patches == num_positions and height == width: + return self.position_embeddings + class_pos_embed = self.position_embeddings[:, 0] + patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] + h0 = height // self.config.patch_size + w0 = width // self.config.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + h0, w0 = h0 + 0.1, w0 + 0.1 + patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)), + mode="bicubic", + align_corners=False, + ) + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: Optional[torch.FloatTensor], + bool_masked_pos: Optional[torch.BoolTensor] = None, + 
interpolate_pos_encoding: bool = False, + ) -> Tuple[torch.Tensor]: + _, num_channels, height, width = pixel_values.shape + embeddings, output_dimensions = self.patch_embeddings(pixel_values) + embeddings = self.norm(embeddings) + batch_size, seq_len, _ = embeddings.size() + + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + if self.position_embeddings is not None: + # if interpolate_pos_encoding: + # embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + # else: + # embeddings = embeddings + self.position_embeddings + embeddings = embeddings + self.position_embeddings[:, :seq_len, :] # code edited. + + ### code added. ### + if self.row_embeddings is not None and self.column_embeddings is not None: + # Repeat the x position embeddings across the y axis like 0, 1, 2, 3, 0, 1, 2, 3, ... 
+ row_embeddings = self.row_embeddings[:, :output_dimensions[0], :].repeat_interleave(output_dimensions[1], dim=1) + column_embeddings = self.column_embeddings[:, :output_dimensions[1], :].repeat(1, output_dimensions[0], 1) + embeddings = embeddings + row_embeddings + column_embeddings + ###### + + embeddings = self.dropout(embeddings) + + return embeddings, output_dimensions + +class StemLayer(nn.Module): + r""" Stem layer of InternImage + Args: + in_chans (int): number of input channels + out_chans (int): number of output channels + act_layer (str): activation layer + norm_layer (str): normalization layer + """ + + def __init__(self, in_chans=3, out_chans=96, act_layer=nn.GELU, norm_layer='BN'): + super().__init__() + self.conv1 = nn.Conv2d(in_chans, out_chans // 2, kernel_size=3, stride=2, padding=1) + self.norm1 = self.build_norm_layer(out_chans // 2, norm_layer) + self.act = act_layer() + self.conv2 = nn.Conv2d(out_chans // 2, out_chans, kernel_size=3, stride=2, padding=1) + + def build_norm_layer(self, dim, norm_layer): + layers = [] + if norm_layer == 'BN': + layers.append(nn.BatchNorm2d(dim)) + else: + raise NotImplementedError(f'build_norm_layer does not support {norm_layer}') + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.norm1(x) + x = self.act(x) + x = self.conv2(x) + return x + +# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings with Swin->UnimerSwin +class UnimerSwinPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
+ """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.embed_dim + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + + ### code edited. ### + # self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + self.projection = StemLayer(in_chans=num_channels, out_chans=hidden_size) + ### + + def maybe_pad(self, pixel_values, height, width): + if width % self.patch_size[1] != 0: + pad_values = (0, self.patch_size[1] - width % self.patch_size[1]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + if height % self.patch_size[0] != 0: + pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + return pixel_values + + def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]: + _, num_channels, height, width = pixel_values.shape + # pad the input to be divisible by self.patch_size, if needed + pixel_values = self.maybe_pad(pixel_values, height, width) + embeddings = self.projection(pixel_values) + _, _, height, width = embeddings.shape + output_dimensions = (height, width) + embeddings = embeddings.flatten(2).transpose(1, 2) + + return embeddings, output_dimensions + + +# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging +class UnimerSwinPatchMerging(nn.Module): + """ + Patch Merging Layer. 
+ + Args: + input_resolution (`Tuple[int]`): + Resolution of input feature. + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. + """ + + def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None: + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def maybe_pad(self, input_feature, height, width): + should_pad = (height % 2 == 1) or (width % 2 == 1) + if should_pad: + pad_values = (0, 0, 0, width % 2, 0, height % 2) + input_feature = nn.functional.pad(input_feature, pad_values) + + return input_feature + + def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor: + height, width = input_dimensions + # `dim` is height * width + batch_size, dim, num_channels = input_feature.shape + + input_feature = input_feature.view(batch_size, height, width, num_channels) + # pad input to be disible by width and height, if needed + input_feature = self.maybe_pad(input_feature, height, width) + # [batch_size, height/2, width/2, num_channels] + input_feature_0 = input_feature[:, 0::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_1 = input_feature[:, 1::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_2 = input_feature[:, 0::2, 1::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_3 = input_feature[:, 1::2, 1::2, :] + # batch_size height/2 width/2 4*num_channels + input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) + input_feature = input_feature.view(batch_size, -1, 4 * num_channels) # batch_size height/2*width/2 4*C + + input_feature = self.norm(input_feature) + input_feature = self.reduction(input_feature) + + return input_feature + + +# Copied from 
transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.swin.modeling_swin.SwinDropPath +class UnimerSwinDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->UnimerSwin +class UnimerSwinSelfAttention(nn.Module): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + 
+ self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.window_size = ( + window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size) + ) + + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads) + ) + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij")) + coords_flatten = torch.flatten(coords, 1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.permute(1, 2, 0).contiguous() + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + batch_size, dim, num_channels = hidden_states.shape + mixed_query_layer = self.query(hidden_states) 
+ + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] + relative_position_bias = relative_position_bias.view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) + + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() + attention_scores = attention_scores + relative_position_bias.unsqueeze(0) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in UnimerSwinModel forward() function) + mask_shape = attention_mask.shape[0] + attention_scores = attention_scores.view( + batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim + ) + attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) + attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput +class UnimerSwinSelfOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->UnimerSwin +class UnimerSwinAttention(nn.Module): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + self.self = UnimerSwinSelfAttention(config, dim, num_heads, window_size) + self.output = UnimerSwinSelfOutput(config, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) 
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinIntermediate +class UnimerSwinIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinOutput +class UnimerSwinOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class ConvEnhance(nn.Module): + """Depth-wise convolution to get the positional information. 
+ """ + def __init__(self, config, dim, k=3): + super(ConvEnhance, self).__init__() + self.proj = nn.Conv2d(dim, + dim, + (k,k), + (1,1), + (k // 2,k // 2), + groups=dim) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x, size: Tuple[int, int]): + B, N, C = x.shape + H, W = size + assert N == H * W + + feat = x.transpose(1, 2).view(B, C, H, W) + feat = self.proj(feat) + feat = self.act_fn(feat) + feat = feat.flatten(2).transpose(1, 2) + + x = x + feat + return x + + +# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->UnimerSwin +class UnimerSwinLayer(nn.Module): + def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.shift_size = shift_size + self.window_size = config.window_size + self.input_resolution = input_resolution + self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps) + + self.ce = nn.ModuleList([ConvEnhance(config, dim=dim, k=3), + ConvEnhance(config, dim=dim, k=3)]) + + self.attention = UnimerSwinAttention(config, dim, num_heads, window_size=self.window_size) + self.drop_path = UnimerSwinDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps) + self.intermediate = UnimerSwinIntermediate(config, dim) + self.output = UnimerSwinOutput(config, dim) + + def set_shift_and_window_size(self, input_resolution): + if min(input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = torch_int(0) + self.window_size = ( + torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution) + ) + + def get_attn_mask(self, height, width, dtype, device): + if self.shift_size > 0: + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device) + height_slices = 
( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + width_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + img_mask[:, height_slice, width_slice, :] = count + count += 1 + + mask_windows = window_partition(img_mask, self.window_size) + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + return attn_mask + + def maybe_pad(self, hidden_states, height, width): + pad_right = (self.window_size - width % self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + pad_values = (0, 0, 0, pad_right, 0, pad_bottom) + hidden_states = nn.functional.pad(hidden_states, pad_values) + return hidden_states, pad_values + + def forward( + self, + hidden_states: torch.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + always_partition: Optional[bool] = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if not always_partition: + self.set_shift_and_window_size(input_dimensions) + else: + pass + height, width = input_dimensions + batch_size, _, channels = hidden_states.size() + + + + hidden_states = self.ce[0](hidden_states, input_dimensions) + shortcut = hidden_states + + + hidden_states = self.layernorm_before(hidden_states) + hidden_states = hidden_states.view(batch_size, height, width, channels) + + # pad hidden_states to multiples of window size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + # cyclic shift 
+ if self.shift_size > 0: + shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_hidden_states = hidden_states + + # partition windows + hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) + hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels) + attn_mask = self.get_attn_mask( + height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states_windows.device + ) + + attention_outputs = self.attention( + hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions + ) + + attention_output = attention_outputs[0] + + attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels) + shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) + + # reverse cyclic shift + if self.shift_size > 0: + attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + attention_windows = shifted_windows + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_windows = attention_windows[:, :height, :width, :].contiguous() + + attention_windows = attention_windows.view(batch_size, height * width, channels) + + hidden_states = shortcut + self.drop_path(attention_windows) + + + + hidden_states = self.ce[1](hidden_states, input_dimensions) + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.output(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinStage with Swin->UnimerSwin +class UnimerSwinStage(nn.Module): + def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): + super().__init__() + self.config = 
# Adapted from transformers.models.swin.modeling_swin.SwinEncoder (Swin -> UnimerSwin)
class UnimerSwinEncoder(nn.Module):
    """Stack of `UnimerSwinStage` modules applied one after another.

    One stage is built per entry of `config.depths`. Stage `i` uses
    `config.embed_dim * 2**i` channels and a grid that is `2**i` times smaller
    than `grid_size`; every stage except the last is given
    `UnimerSwinPatchMerging` as its downsampling layer.
    """

    def __init__(self, config, grid_size):
        super().__init__()
        self.config = config
        self.num_layers = len(config.depths)

        # One drop-path rate per block, increasing linearly from 0 to
        # config.drop_path_rate over all blocks of all stages.
        drop_rates = torch.linspace(0, config.drop_path_rate, sum(config.depths)).tolist()

        stages = []
        first_block = 0  # index of this stage's first block inside drop_rates
        for stage_idx, stage_depth in enumerate(config.depths):
            scale = 2**stage_idx
            is_last_stage = stage_idx == self.num_layers - 1
            stages.append(
                UnimerSwinStage(
                    config=config,
                    dim=int(config.embed_dim * scale),
                    input_resolution=(grid_size[0] // scale, grid_size[1] // scale),
                    depth=stage_depth,
                    num_heads=config.num_heads[stage_idx],
                    drop_path=drop_rates[first_block : first_block + stage_depth],
                    downsample=None if is_last_stage else UnimerSwinPatchMerging,
                )
            )
            first_block += stage_depth
        self.layers = nn.ModuleList(stages)

        self.gradient_checkpointing = False

    @staticmethod
    def _to_feature_map(states, grid):
        """Rearrange `b (h w) c` states into a `b c h w` map for the given (h, w) grid."""
        batch_size, _, hidden_size = states.shape
        return states.view(batch_size, *grid, hidden_size).permute(0, 3, 1, 2)

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, UnimerSwinEncoderOutput]:
        """Run every stage in order and optionally collect intermediate results.

        When `output_hidden_states` is set, the input and each stage's output
        are recorded both as sequences and as `b c h w` maps; with
        `output_hidden_states_before_downsampling` the pre-downsampling states
        of each stage are recorded instead of its output. With
        `return_dict=False` a tuple of the non-None entries among
        (last hidden state, hidden states, attentions) is returned.
        """
        collected_states = () if output_hidden_states else None
        collected_maps = () if output_hidden_states else None
        collected_attentions = () if output_attentions else None

        if output_hidden_states:
            collected_states += (hidden_states,)
            collected_maps += (self._to_feature_map(hidden_states, input_dimensions),)

        for stage_idx, stage in enumerate(self.layers):
            stage_head_mask = None if head_mask is None else head_mask[stage_idx]

            if self.gradient_checkpointing and self.training:
                stage_outputs = self._gradient_checkpointing_func(
                    stage.__call__,
                    hidden_states,
                    input_dimensions,
                    stage_head_mask,
                    output_attentions,
                    always_partition,
                )
            else:
                stage_outputs = stage(
                    hidden_states, input_dimensions, stage_head_mask, output_attentions, always_partition
                )

            hidden_states = stage_outputs[0]
            states_before_downsampling = stage_outputs[1]
            stage_dimensions = stage_outputs[2]

            # The last two entries are the grid size seen by the next stage.
            input_dimensions = (stage_dimensions[-2], stage_dimensions[-1])

            if output_hidden_states:
                if output_hidden_states_before_downsampling:
                    # Use the original (not downsampled) height and width here.
                    recorded = states_before_downsampling
                    grid = (stage_dimensions[0], stage_dimensions[1])
                else:
                    recorded = hidden_states
                    grid = input_dimensions
                collected_states += (recorded,)
                collected_maps += (self._to_feature_map(recorded, grid),)

            if output_attentions:
                collected_attentions += stage_outputs[3:]

        if not return_dict:
            candidates = [hidden_states, collected_states, collected_attentions]
            return tuple(item for item in candidates if item is not None)

        return UnimerSwinEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=collected_states,
            attentions=collected_attentions,
            reshaped_hidden_states=collected_maps,
        )
+ """ + + config_class = UnimerSwinConfig + base_model_prefix = "unimer-swin" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["UnimerSwinStage"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +SWIN_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`UnimerSwinConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +SWIN_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`DonutImageProcessor.__call__`] for details. + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. 
# Bare backbone: patch embeddings -> staged encoder -> optional average pooling.
# No class docstring is written here on purpose: the decorator below builds the
# class documentation from the strings it is given.
@add_start_docstrings(
    "The bare UnimerSwin Model transformer outputting raw hidden-states without any specific head on top.",
    SWIN_START_DOCSTRING,
)
class UnimerSwinModel(UnimerSwinPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        """Build the embeddings, the encoder and (optionally) the pooling layer."""
        super().__init__(config)
        self.config = config
        self.num_layers = len(config.depths)
        # Channel count doubles at every stage after the first one.
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))

        self.embeddings = UnimerSwinEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = UnimerSwinEncoder(config, self.embeddings.patch_grid)
        if add_pooling_layer:
            self.pooler = nn.AdaptiveAvgPool1d(1)
        else:
            self.pooler = None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        """Return the patch-embedding module owned by `self.embeddings`."""
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """Prune attention heads.

        `heads_to_prune` maps a layer number to the list of heads to prune in
        that layer (see the base class `PreTrainedModel`).
        """
        # NOTE(review): this reads `self.encoder.layer`, while UnimerSwinEncoder
        # stores its stages in `self.layers` -- confirm whether pruning is ever used.
        for layer_index, head_ids in heads_to_prune.items():
            self.encoder.layer[layer_index].attention.prune_heads(head_ids)

    @add_start_docstrings_to_model_forward(SWIN_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=UnimerSwinModelOutput,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, UnimerSwinModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        # Unset flags fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed: 1.0 in head_mask means the head is kept.
        # The input mask has shape [num_heads] or [num_hidden_layers x num_heads] and is
        # expanded to [num_hidden_layers x batch x num_heads x seq_length x seq_length].
        head_mask = self.get_head_mask(head_mask, len(self.config.depths))

        embedded, grid_dimensions = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoded = self.encoder(
            embedded,
            grid_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_states = encoded[0]

        pooled = None
        if self.pooler is not None:
            # Pool over the sequence axis, then drop the singleton dimension.
            pooled = torch.flatten(self.pooler(last_states.transpose(1, 2)), 1)

        if not return_dict:
            return (last_states, pooled) + encoded[1:]

        return UnimerSwinModelOutput(
            last_hidden_state=last_states,
            pooler_output=pooled,
            hidden_states=encoded.hidden_states,
            attentions=encoded.attentions,
            reshaped_hidden_states=encoded.reshaped_hidden_states,
        )
b/vendor/mineru/mineru/model/ocr/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/__init__.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py new file mode 100644 index 0000000000000000000000000000000000000000..de56ca3ac405b6b01c6174308da32aa951af596e --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorch_paddle.py @@ -0,0 +1,209 @@ +# Copyright (c) Opendatalab. All rights reserved. +import copy +import os +import warnings +from pathlib import Path + +import cv2 +import numpy as np +import yaml +from loguru import logger + +from mineru.utils.config_reader import get_device +from mineru.utils.enum_class import ModelPath +from mineru.utils.models_download_utils import auto_download_and_get_model_root_path +from ....utils.ocr_utils import check_img, preprocess_image, sorted_boxes, merge_det_boxes, update_det_boxes, get_rotate_crop_image +from .tools.infer.predict_system import TextSystem +from .tools.infer import pytorchocr_utility as utility +import argparse + + +latin_lang = [ + 'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', # noqa: E126 + 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl', + 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv', + 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german' +] +arabic_lang = ['ar', 'fa', 'ug', 'ur'] +cyrillic_lang = [ + 'rs_cyrillic', 'bg', 'mn', 'abq', 'ady', 'kbd', 'ava', # noqa: E126 + 'dar', 'inh', 'che', 'lbe', 'lez', 'tab' +] 
def get_model_params(lang, config):
    """Look up the detection model, recognition model and dictionary for *lang*.

    Args:
        lang: language key to look up under ``config['lang']``.
        config: mapping with a ``'lang'`` entry that maps language keys to
            per-language parameter mappings.

    Returns:
        A ``(det, rec, dict_file)`` tuple; an entry is ``None`` when the
        per-language mapping does not define it.

    Raises:
        Exception: if *lang* is not present in ``config['lang']``.
    """
    languages = config['lang']
    if lang not in languages:
        raise Exception(f'Language {lang} not supported')

    entry = languages[lang]
    return tuple(entry.get(field) for field in ('det', 'rec', 'dict'))
'utils', 'resources', 'dict', dict_file) + # kwargs['rec_batch_num'] = 8 + + kwargs['device'] = device + + default_args = vars(args) + default_args.update(kwargs) + args = argparse.Namespace(**default_args) + + super().__init__(args) + + def ocr(self, + img, + det=True, + rec=True, + mfd_res=None, + tqdm_enable=False, + ): + assert isinstance(img, (np.ndarray, list, str, bytes)) + if isinstance(img, list) and det == True: + logger.error('When input a list of images, det must be false') + exit(0) + img = check_img(img) + imgs = [img] + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=RuntimeWarning) + if det and rec: + ocr_res = [] + for img in imgs: + img = preprocess_image(img) + dt_boxes, rec_res = self.__call__(img, mfd_res=mfd_res) + if not dt_boxes and not rec_res: + ocr_res.append(None) + continue + tmp_res = [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)] + ocr_res.append(tmp_res) + return ocr_res + elif det and not rec: + ocr_res = [] + for img in imgs: + img = preprocess_image(img) + dt_boxes, elapse = self.text_detector(img) + # logger.debug("dt_boxes num : {}, elapsed : {}".format(len(dt_boxes), elapse)) + if dt_boxes is None: + ocr_res.append(None) + continue + dt_boxes = sorted_boxes(dt_boxes) + # merge_det_boxes 和 update_det_boxes 都会把poly转成bbox再转回poly,因此需要过滤所有倾斜程度较大的文本框 + dt_boxes = merge_det_boxes(dt_boxes) + if mfd_res: + dt_boxes = update_det_boxes(dt_boxes, mfd_res) + tmp_res = [box.tolist() for box in dt_boxes] + ocr_res.append(tmp_res) + return ocr_res + elif not det and rec: + ocr_res = [] + for img in imgs: + if not isinstance(img, list): + img = preprocess_image(img) + img = [img] + rec_res, elapse = self.text_recognizer(img, tqdm_enable=tqdm_enable) + # logger.debug("rec_res num : {}, elapsed : {}".format(len(rec_res), elapse)) + ocr_res.append(rec_res) + return ocr_res + + def __call__(self, img, mfd_res=None): + + if img is None: + logger.debug("no valid image provided") + return None, None + + ori_im 
= img.copy() + dt_boxes, elapse = self.text_detector(img) + + if dt_boxes is None: + logger.debug("no dt_boxes found, elapsed : {}".format(elapse)) + return None, None + else: + pass + # logger.debug("dt_boxes num : {}, elapsed : {}".format(len(dt_boxes), elapse)) + img_crop_list = [] + + dt_boxes = sorted_boxes(dt_boxes) + + # merge_det_boxes 和 update_det_boxes 都会把poly转成bbox再转回poly,因此需要过滤所有倾斜程度较大的文本框 + dt_boxes = merge_det_boxes(dt_boxes) + + if mfd_res: + dt_boxes = update_det_boxes(dt_boxes, mfd_res) + + for bno in range(len(dt_boxes)): + tmp_box = copy.deepcopy(dt_boxes[bno]) + img_crop = get_rotate_crop_image(ori_im, tmp_box) + img_crop_list.append(img_crop) + + rec_res, elapse = self.text_recognizer(img_crop_list) + # logger.debug("rec_res num : {}, elapsed : {}".format(len(rec_res), elapse)) + + filter_boxes, filter_rec_res = [], [] + for box, rec_result in zip(dt_boxes, rec_res): + text, score = rec_result + if score >= self.drop_score: + filter_boxes.append(box) + filter_rec_res.append(rec_result) + + return filter_boxes, filter_rec_res + +if __name__ == '__main__': + pytorch_paddle_ocr = PytorchPaddleOCR() + img = cv2.imread("/Users/myhloli/Downloads/screenshot-20250326-194348.png") + dt_boxes, rec_res = pytorch_paddle_ocr(img) + ocr_res = [] + if not dt_boxes and not rec_res: + ocr_res.append(None) + else: + tmp_res = [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)] + ocr_res.append(tmp_res) + print(ocr_res) + + diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/__init__.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py new file mode 100755 index 
class BaseOCRV20:
    """Thin wrapper that builds a `BaseModel` network and runs it in eval mode."""

    def __init__(self, config, **kwargs):
        """Build the network from *config* and switch it to evaluation mode."""
        self.config = config
        self.build_net(**kwargs)
        self.net.eval()

    def build_net(self, **kwargs):
        """Create ``self.net`` from ``self.config``; *kwargs* go to `BaseModel`."""
        self.net = BaseModel(self.config, **kwargs)

    def read_pytorch_weights(self, weights_path):
        """Load and return the object stored at *weights_path*.

        Raises:
            FileNotFoundError: if *weights_path* does not exist.
        """
        if not os.path.exists(weights_path):
            raise FileNotFoundError('{} is not existed.'.format(weights_path))
        # weights_only=True matches load_pytorch_weights below and avoids
        # executing arbitrary pickled code from the checkpoint file.
        return torch.load(weights_path, weights_only=True)

    def get_out_channels(self, weights):
        """Return the output channel count implied by the last entry of *weights*.

        For a 2-D tensor whose key ends with ``.weight`` the second dimension
        is used; otherwise the first dimension is used. The shape is read
        directly from the tensor, so tensors that require grad or are not on
        the CPU are handled as well.
        """
        last_key, last_value = list(weights.items())[-1]
        shape = last_value.shape
        if last_key.endswith('.weight') and len(shape) == 2:
            return shape[1]
        return shape[0]

    def load_state_dict(self, weights):
        """Load an in-memory state dict into the network."""
        self.net.load_state_dict(weights)

    def load_pytorch_weights(self, weights_path):
        """Load a state dict from *weights_path* into the network."""
        self.net.load_state_dict(torch.load(weights_path, weights_only=True))

    def inference(self, inputs):
        """Run the network on *inputs* with gradient tracking disabled."""
        with torch.no_grad():
            return self.net(inputs)
def create_operators(op_param_list, global_config=None):
    """Instantiate the operators described by *op_param_list*.

    Args:
        op_param_list (list): list of single-key dicts; the key is the
            operator name and the value is its keyword arguments (or None).
        global_config (dict, optional): extra keyword arguments merged into
            every operator's arguments, overriding keys with the same name.

    Returns:
        list: one operator instance per entry, in order.

    The per-operator argument dicts are copied before merging, so the
    caller's configuration is no longer modified by this function.
    """
    assert isinstance(op_param_list, list), ('operator config should be a list')
    ops = []
    for operator in op_param_list:
        assert isinstance(operator,
                          dict) and len(operator) == 1, "yaml format error"
        op_name = list(operator)[0]
        # Copy so that merging global_config does not leak into the input config.
        param = dict(operator[op_name]) if operator[op_name] is not None else {}
        if global_config is not None:
            param.update(global_config)
        # SECURITY NOTE: the operator name is resolved with eval(), so the
        # operator config must come from a trusted source.
        op = eval(op_name)(**param)
        ops.append(op)
    return ops
class DecodeImage(object):
    """Decode an encoded image held in ``data['image']`` into an array.

    Args:
        img_mode: ``'RGB'`` reverses the decoded channel order; ``'GRAY'``
            produces a grey image replicated over three channels; any other
            value leaves the decoded image as returned by ``cv2.imdecode``.
        channel_first: transpose the result from HWC to CHW.
    """

    def __init__(self, img_mode='RGB', channel_first=False, **kwargs):
        self.img_mode = img_mode
        self.channel_first = channel_first

    def __call__(self, data):
        """Replace ``data['image']`` (bytes) with the decoded array.

        Returns:
            The updated *data* dict, or ``None`` if the bytes cannot be decoded.
        """
        img = data['image']
        # `bytes` is the right check on Python 3 (and is `str` on Python 2),
        # so no separate six.PY2 branch is needed.
        assert isinstance(img, bytes) and len(
            img) > 0, "invalid input 'img' in DecodeImage"
        img = np.frombuffer(img, dtype='uint8')
        img = cv2.imdecode(img, 1)
        if img is None:
            return None
        if self.img_mode == 'GRAY':
            # imdecode with flag 1 always yields 3 channels, so GRAY2BGR cannot
            # be applied directly; collapse to one channel first.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        elif self.img_mode == 'RGB':
            # Wrap the shape in a 1-tuple so the message formats correctly.
            assert img.shape[2] == 3, 'invalid shape of image[%s]' % (img.shape,)
            img = img[:, :, ::-1]

        if self.channel_first:
            img = img.transpose((2, 0, 1))

        data['image'] = img
        return data
class NormalizeImage(object):
    """Scale an image, subtract the mean and divide by the std.

    Args:
        scale: multiplier applied first; a string is evaluated as a Python
            expression. Defaults to ``1/255``.
        mean: per-channel mean, defaults to ``[0.485, 0.456, 0.406]``.
        std: per-channel std, defaults to ``[0.229, 0.224, 0.225]``.
        order: ``'chw'`` broadcasts mean/std as ``(3, 1, 1)``; anything else
            as ``(1, 1, 3)``.
    """

    def __init__(self, scale=None, mean=None, std=None, order='chw', **kwargs):
        if isinstance(scale, str):
            # SECURITY NOTE: eval() on a config string; the operator config
            # must come from a trusted source.
            scale = eval(scale)
        self.scale = np.float32(scale if scale is not None else 1.0 / 255.0)
        mean = mean if mean is not None else [0.485, 0.456, 0.406]
        std = std if std is not None else [0.229, 0.224, 0.225]

        shape = (3, 1, 1) if order == 'chw' else (1, 1, 3)
        self.mean = np.array(mean).reshape(shape).astype('float32')
        self.std = np.array(std).reshape(shape).astype('float32')

    def __call__(self, data):
        """Normalize ``data['image']`` in place and return *data*.

        The image may be a ``np.ndarray`` or a PIL image. Pillow is imported
        only when the input is not already an array, so array inputs do not
        need Pillow to be installed.
        """
        img = data['image']
        if not isinstance(img, np.ndarray):
            from PIL import Image
            if isinstance(img, Image.Image):
                img = np.array(img)
        assert isinstance(img,
                          np.ndarray), "invalid input 'img' in NormalizeImage"
        data['image'] = (
            img.astype('float32') * self.scale - self.mean) / self.std
        return data
class DetResizeForTest(object):
    """Resize an image for text detection and record the applied ratios.

    The resize strategy is chosen from the keyword arguments:

    * ``image_shape`` -> type 1: resize to exactly that ``(h, w)``.
    * ``limit_side_len`` (+ optional ``limit_type``, default ``'min'``)
      -> type 0: limit one side, then round both sides to a multiple of 32.
    * ``resize_long`` -> type 2: scale the long side to that length, then
      round both sides up to a multiple of 128.
    * nothing -> type 0 with ``limit_side_len=736`` and ``limit_type='min'``.
    """

    def __init__(self, **kwargs):
        super(DetResizeForTest, self).__init__()
        self.resize_type = 0
        if 'image_shape' in kwargs:
            self.image_shape = kwargs['image_shape']
            self.resize_type = 1
        elif 'limit_side_len' in kwargs:
            self.limit_side_len = kwargs['limit_side_len']
            self.limit_type = kwargs.get('limit_type', 'min')
        elif 'resize_long' in kwargs:
            self.resize_type = 2
            self.resize_long = kwargs.get('resize_long', 960)
        else:
            self.limit_side_len = 736
            self.limit_type = 'min'

    def __call__(self, data):
        """Resize ``data['image']`` and store ``[src_h, src_w, ratio_h, ratio_w]`` in ``data['shape']``."""
        img = data['image']
        src_h, src_w, _ = img.shape

        if self.resize_type == 0:
            img, [ratio_h, ratio_w] = self.resize_image_type0(img)
        elif self.resize_type == 2:
            img, [ratio_h, ratio_w] = self.resize_image_type2(img)
        else:
            img, [ratio_h, ratio_w] = self.resize_image_type1(img)
        data['image'] = img
        data['shape'] = np.array([src_h, src_w, ratio_h, ratio_w])
        return data

    def resize_image_type1(self, img):
        """Resize to ``self.image_shape``; return ``(img, [ratio_h, ratio_w])``."""
        resize_h, resize_w = self.image_shape
        ori_h, ori_w = img.shape[:2]  # (h, w, c)
        ratio_h = float(resize_h) / ori_h
        ratio_w = float(resize_w) / ori_w
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        return img, [ratio_h, ratio_w]

    def resize_image_type0(self, img):
        """
        resize image to a size multiple of 32 which is required by the network
        args:
            img(array): array with shape [h, w, c]
        return(tuple):
            img, (ratio_h, ratio_w)
        raises:
            Exception: if ``self.limit_type`` is not 'max', 'min' or 'resize_long'.
            Any error raised by ``cv2.resize`` is re-raised after the offending
            sizes are printed (previously the process exited with status 0).
        """
        limit_side_len = self.limit_side_len
        h, w, c = img.shape

        if self.limit_type == 'max':
            # Shrink only when the longer side exceeds the limit.
            if max(h, w) > limit_side_len:
                if h > w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
        elif self.limit_type == 'min':
            # Enlarge only when the shorter side is below the limit.
            if min(h, w) < limit_side_len:
                if h < w:
                    ratio = float(limit_side_len) / h
                else:
                    ratio = float(limit_side_len) / w
            else:
                ratio = 1.
        elif self.limit_type == 'resize_long':
            ratio = float(limit_side_len) / max(h, w)
        else:
            raise Exception('not support limit type, image ')
        resize_h = int(h * ratio)
        resize_w = int(w * ratio)

        # Both sides end up as a multiple of 32 and at least 32, so they are
        # always positive; no separate "<= 0" check is needed.
        resize_h = max(int(round(resize_h / 32) * 32), 32)
        resize_w = max(int(round(resize_w / 32) * 32), 32)

        try:
            img = cv2.resize(img, (int(resize_w), int(resize_h)))
        except Exception:
            print(img.shape, resize_w, resize_h)
            raise
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return img, [ratio_h, ratio_w]

    def resize_image_type2(self, img):
        """Scale the long side to ``self.resize_long``, rounding both sides up to a multiple of 128."""
        h, w, _ = img.shape

        resize_w = w
        resize_h = h

        if resize_h > resize_w:
            ratio = float(self.resize_long) / resize_h
        else:
            ratio = float(self.resize_long) / resize_w

        resize_h = int(resize_h * ratio)
        resize_w = int(resize_w * ratio)

        max_stride = 128
        resize_h = (resize_h + max_stride - 1) // max_stride * max_stride
        resize_w = (resize_w + max_stride - 1) // max_stride * max_stride
        img = cv2.resize(img, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)

        return img, [ratio_h, ratio_w]
class KieResize(object):
    """Resize an image and its box coordinates for key-information extraction.

    ``img_scale`` (required keyword) supplies ``max_side`` and ``min_side``
    as its first and second element.
    """

    def __init__(self, **kwargs):
        super(KieResize, self).__init__()
        img_scale = kwargs['img_scale']
        self.max_side = img_scale[0]
        self.min_side = img_scale[1]

    def __call__(self, data):
        """Resize ``data['image']`` / ``data['points']`` and keep the originals.

        Stores the originals under ``'ori_image'`` / ``'ori_boxes'``, the
        resized values under ``'image'`` / ``'points'`` and the resized
        ``[h, w]`` under ``'shape'``.
        """
        image = data['image']
        boxes = data['points']
        # Unpacking requires a 3-dimensional (h, w, c) image.
        src_h, src_w, _ = image.shape
        padded, factors, [ratio_h, ratio_w], [new_h, new_w] = self.resize_image(image)
        data['ori_image'] = image
        data['ori_boxes'] = boxes
        data['points'] = self.resize_boxes(image, boxes, factors)
        data['image'] = padded
        data['shape'] = np.array([new_h, new_w])
        return data

    def resize_image(self, img):
        """Resize *img* and paste it into the top-left of a 1024x1024x3 float32 canvas.

        Returns:
            ``(canvas, scale_factor, [h_scale, w_scale], [new_h, new_w])`` where
            ``scale_factor`` is ``[w_scale, h_scale, w_scale, h_scale]`` as float32.
        """
        canvas = np.zeros([1024, 1024, 3], dtype='float32')
        target_scale = [512, 1024]
        stride = 32

        height, width = img.shape[:2]
        long_limit = max(target_scale)
        short_limit = min(target_scale)
        factor = min(long_limit / max(height, width),
                     short_limit / min(height, width))

        target_w = int(width * float(factor) + 0.5)
        target_h = int(height * float(factor) + 0.5)
        # Round both sides up to the next multiple of the stride.
        target_h = (target_h + stride - 1) // stride * stride
        target_w = (target_w + stride - 1) // stride * stride

        resized = cv2.resize(img, (target_w, target_h))
        new_h, new_w = resized.shape[:2]
        w_scale = new_w / width
        h_scale = new_h / height
        canvas[:new_h, :new_w, :] = resized

        scale_factor = np.array(
            [w_scale, h_scale, w_scale, h_scale], dtype=np.float32)
        return canvas, scale_factor, [h_scale, w_scale], [new_h, new_w]

    def resize_boxes(self, im, points, scale_factor):
        """Scale *points* and clip x/y columns to the width/height of *im*.

        Even columns are clipped to ``[0, im.shape[1]]`` and odd columns to
        ``[0, im.shape[0]]``. The input array is not modified.
        """
        scaled = points * scale_factor
        bounds = im.shape[:2]
        x_columns = np.clip(scaled[:, 0::2], 0, bounds[1])
        scaled[:, 0::2] = x_columns
        y_columns = np.clip(scaled[:, 1::2], 0, bounds[0])
        scaled[:, 1::2] = y_columns
        return scaled
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+
+__all__ = ["build_model"]
+
+
+def build_model(config, **kwargs):
+    """Build a BaseModel from a deep copy of config, leaving the caller's dict untouched."""
+    from .base_model import BaseModel
+
+    config = copy.deepcopy(config)
+    return BaseModel(config, **kwargs)
diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7f7ce49b7201f99e050cb8d83b3eb0fb318760d
--- /dev/null
+++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py
@@ -0,0 +1,124 @@
+from torch import nn
+
+from ..backbones import build_backbone
+from ..heads import build_head
+from ..necks import build_neck
+
+
+class BaseModel(nn.Module):
+    """OCR model assembled from optional backbone, neck and head stages."""
+
+    def __init__(self, config, **kwargs):
+        """
+        Build the stages named in ``config``.
+
+        Args:
+            config (dict): model description. ``model_type`` is required;
+                ``Backbone``, ``Neck`` and ``Head`` are optional sub-configs
+                (a missing or ``None`` entry disables that stage). The
+                sub-configs are mutated: ``in_channels`` is written into each.
+            **kwargs: forwarded unchanged to ``build_head``.
+        """
+        super(BaseModel, self).__init__()
+
+        in_channels = config.get("in_channels", 3)
+        model_type = config["model_type"]
+        # Backbone is optional; when present it fixes the channel count
+        # handed to the next stage.
+        if "Backbone" not in config or config["Backbone"] is None:
+            self.use_backbone = False
+        else:
+            self.use_backbone = True
+            config["Backbone"]["in_channels"] = in_channels
+            self.backbone = build_backbone(config["Backbone"], model_type)
+            in_channels = self.backbone.out_channels
+
+        # build neck
+        # for rec, neck can be cnn,rnn or reshape(None)
+        # for det, neck can be FPN, BIFPN and so on.
+        # for cls, neck should be none
+        if "Neck" not in config or config["Neck"] is None:
+            self.use_neck = False
+        else:
+            self.use_neck = True
+            config["Neck"]["in_channels"] = in_channels
+            self.neck = build_neck(config["Neck"])
+            in_channels = self.neck.out_channels
+
+        # build head, head is needed for det, rec and cls
+        if "Head" not in config or config["Head"] is None:
+            self.use_head = False
+        else:
+            self.use_head = True
+            config["Head"]["in_channels"] = in_channels
+            self.head = build_head(config["Head"], **kwargs)
+
+        self.return_all_feats = config.get("return_all_feats", False)
+
+        self._initialize_weights()
+
+    def _initialize_weights(self):
+        """Re-initialise conv, batch-norm and linear parameters in place."""
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out")
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.ConvTranspose2d):
+                nn.init.kaiming_normal_(m.weight, mode="fan_out")
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+    def forward(self, x):
+        """
+        Run ``x`` through the enabled stages.
+
+        Returns:
+            The last stage's output when ``return_all_feats`` is false.
+            Otherwise: the dict of collected stage outputs in training mode,
+            or in eval mode the final output (wrapped as ``{final_name: x}``
+            when it is not already a dict).
+        """
+        y = dict()
+        if self.use_backbone:
+            x = self.backbone(x)
+        if isinstance(x, dict):
+            y.update(x)
+        else:
+            y["backbone_out"] = x
+        final_name = "backbone_out"
+        if self.use_neck:
+            x = self.neck(x)
+            if isinstance(x, dict):
+                y.update(x)
+            else:
+                y["neck_out"] = x
+            final_name = "neck_out"
+        if self.use_head:
+            x = self.head(x)
+            # for multi head, save ctc neck out for udml
+            # (the key is "ctc_neck"; testing a misspelt key left this
+            # branch unreachable)
+            if isinstance(x, dict) and "ctc_neck" in x:
+                y["neck_out"] = x["ctc_neck"]
+                y["head_out"] = x
+            elif isinstance(x, dict):
+                y.update(x)
+            else:
+                y["head_out"] = x
+        if self.return_all_feats:
+            if self.training:
+                return y
+            elif isinstance(x, dict):
+                return x
+            else:
+                return {final_name: x}
+        else:
+            return x
diff --git 
a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4715ee7a7da16f90056381e1ff6f9112b68cf1c
--- /dev/null
+++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ["build_backbone"]
+
+
+def build_backbone(config, model_type):
+    """
+    Instantiate the backbone named by ``config["name"]``.
+
+    Args:
+        config (dict): backbone settings. ``name`` is popped (the dict is
+            mutated); the remaining items become constructor kwargs.
+        model_type (str): ``"det"``, ``"rec"`` or ``"cls"``.
+
+    Raises:
+        NotImplementedError: unknown ``model_type``, or a backbone that is
+            listed as supported but has no implementation imported here.
+        AssertionError: ``name`` is not in the list for ``model_type``.
+    """
+    if model_type == "det":
+        from .det_mobilenet_v3 import MobileNetV3
+        from .rec_hgnet import PPHGNet_small
+        from .rec_lcnetv3 import PPLCNetV3
+        from .rec_pphgnetv2 import PPHGNetV2_B4
+
+        support_dict = [
+            "MobileNetV3",
+            "ResNet",
+            "ResNet_vd",
+            "ResNet_SAST",
+            "PPLCNetV3",
+            "PPHGNet_small",
+            "PPHGNetV2_B4",
+        ]
+        implemented = {
+            "MobileNetV3": MobileNetV3,
+            "PPLCNetV3": PPLCNetV3,
+            "PPHGNet_small": PPHGNet_small,
+            "PPHGNetV2_B4": PPHGNetV2_B4,
+        }
+    elif model_type == "rec" or model_type == "cls":
+        from .rec_hgnet import PPHGNet_small
+        from .rec_lcnetv3 import PPLCNetV3
+        from .rec_mobilenet_v3 import MobileNetV3
+        from .rec_svtrnet import SVTRNet
+        from .rec_mv1_enhance import MobileNetV1Enhance
+        from .rec_pphgnetv2 import PPHGNetV2_B4
+
+        support_dict = [
+            "MobileNetV1Enhance",
+            "MobileNetV3",
+            "ResNet",
+            "ResNetFPN",
+            "MTB",
+            "ResNet31",
+            "SVTRNet",
+            "ViTSTR",
+            "DenseNet",
+            "PPLCNetV3",
+            "PPHGNet_small",
+            "PPHGNetV2_B4",
+        ]
+        implemented = {
+            "MobileNetV1Enhance": MobileNetV1Enhance,
+            "MobileNetV3": MobileNetV3,
+            "SVTRNet": SVTRNet,
+            "PPLCNetV3": PPLCNetV3,
+            "PPHGNet_small": PPHGNet_small,
+            "PPHGNetV2_B4": PPHGNetV2_B4,
+        }
+    else:
+        raise NotImplementedError
+
+    module_name = config.pop("name")
+    assert module_name in support_dict, Exception(
+        "when model typs is {}, backbone only support {}".format(
+            model_type, support_dict
+        )
+    )
+    # Look the class up explicitly instead of eval()-ing a config string:
+    # under ``python -O`` the assert above is stripped, and eval would then
+    # execute whatever text the config carries.
+    if module_name not in implemented:
+        raise NotImplementedError(
+            "backbone {} is not implemented for model type {}".format(
+                module_name, model_type
+            )
+        )
+    return implemented[module_name](**config)
diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py
new file mode 100644
index 0000000000000000000000000000000000000000..03511599a0fb6d0d18940e9cd2fef19d217ec6ea
--- /dev/null
+++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py
@@ -0,0 +1,282 @@
+from torch import nn
+
+from ..common import Activation
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    """Round v to the nearest multiple of divisor (floor min_value), stepping up if >10% was lost."""
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class ConvBNLayer(nn.Module):
+    """Conv2d (no bias) -> BatchNorm2d -> optional activation."""
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        groups=1,
+        if_act=True,
+        act=None,
+        name=None,
+    ):
+        super(ConvBNLayer, self).__init__()
+        self.if_act = if_act
+        self.conv = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=False,
+        )
+
+        self.bn = nn.BatchNorm2d(
+            out_channels,
+        )
+        if self.if_act:
+            self.act = Activation(act_type=act, inplace=True)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        if self.if_act:
+            x = self.act(x)
+        return x
+
+
+class SEModule(nn.Module):
+    """Squeeze-and-excitation: scale the input by a pooled, gated per-channel weight."""
+
+    def __init__(self, in_channels, reduction=4, name=""):
+        super(SEModule, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.conv1 = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=in_channels // reduction,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+        )
+        self.relu1 = Activation(act_type="relu", inplace=True)
+        self.conv2 = nn.Conv2d(
+            in_channels=in_channels // reduction,
+            out_channels=in_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+        )
+        self.hard_sigmoid = Activation(act_type="hard_sigmoid", inplace=True)
+
+    def forward(self, inputs):
+        outputs = self.avg_pool(inputs)
+        outputs = self.conv1(outputs)
+        outputs = self.relu1(outputs)
+        outputs = self.conv2(outputs)
+        outputs = self.hard_sigmoid(outputs)
+        outputs = inputs * outputs
+        return outputs
+
+
+class ResidualUnit(nn.Module):
+    """1x1 expand -> depthwise kxk -> optional SE -> 1x1 linear, plus a shortcut when shapes match."""
+
+    def __init__(
+        self,
+        in_channels,
+        mid_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        use_se,
+        act=None,
+        name="",
+    ):
+        super(ResidualUnit, self).__init__()
+        self.if_shortcut = stride == 1 and in_channels == out_channels
+        self.if_se = use_se
+
+        self.expand_conv = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=mid_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            if_act=True,
+            act=act,
+            name=name + "_expand",
+        )
+        self.bottleneck_conv = ConvBNLayer(
+            in_channels=mid_channels,
+            out_channels=mid_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=int((kernel_size - 1) // 2),
+            groups=mid_channels,
+            if_act=True,
+            act=act,
+            name=name + "_depthwise",
+        )
+        if self.if_se:
+            self.mid_se = SEModule(mid_channels, name=name + "_se")
+        self.linear_conv = ConvBNLayer(
+            in_channels=mid_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            if_act=False,
+            act=None,
+            name=name + "_linear",
+        )
+
+    def forward(self, inputs):
+        x = self.expand_conv(inputs)
+        x = self.bottleneck_conv(x)
+        if self.if_se:
+            x = self.mid_se(x)
+        x = self.linear_conv(x)
+        if self.if_shortcut:
+            x = inputs + x
+        return x
+
+
+class MobileNetV3(nn.Module):
+    def __init__(
+        self, in_channels=3, model_name="large", scale=0.5, disable_se=False, **kwargs
+    ):
+        """
+        the MobilenetV3 backbone network for detection module.
+        Args:
+            in_channels (int): channels of the input image.
+            model_name (str): "large" or "small"; selects the layer table.
+            scale (float): width multiplier; must be a supported scale.
+            disable_se (bool): when true, no block gets an SE module.
+        """
+        super(MobileNetV3, self).__init__()
+
+        self.disable_se = disable_se
+
+        if model_name == "large":
+            cfg = [
+                # k, exp, c, se, nl, s,
+                [3, 16, 16, False, "relu", 1],
+                [3, 64, 24, False, "relu", 2],
+                [3, 72, 24, False, "relu", 1],
+                [5, 72, 40, True, "relu", 2],
+                [5, 120, 40, True, "relu", 1],
+                [5, 120, 40, True, "relu", 1],
+                [3, 240, 80, False, "hard_swish", 2],
+                [3, 200, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 480, 112, True, "hard_swish", 1],
+                [3, 672, 112, True, "hard_swish", 1],
+                [5, 672, 160, True, "hard_swish", 2],
+                [5, 960, 160, True, "hard_swish", 1],
+                [5, 960, 160, True, "hard_swish", 1],
+            ]
+            cls_ch_squeeze = 960
+        elif model_name == "small":
+            cfg = [
+                # k, exp, c, se, nl, s,
+                [3, 16, 16, True, "relu", 2],
+                [3, 72, 24, False, "relu", 2],
+                [3, 88, 24, False, "relu", 1],
+                [5, 96, 40, True, "hard_swish", 2],
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 120, 48, True, "hard_swish", 1],
+                [5, 144, 48, True, "hard_swish", 1],
+                [5, 288, 96, True, "hard_swish", 2],
+                [5, 576, 96, True, "hard_swish", 1],
+                [5, 576, 96, True, "hard_swish", 1],
+            ]
+            cls_ch_squeeze = 576
+        else:
+            raise NotImplementedError(
+                "mode[" + model_name + "_model] is not implemented!"
+            )
+
+        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
+        assert (
+            scale in supported_scale
+        ), "supported scale are {} but input scale is {}".format(supported_scale, scale)
+        inplanes = 16
+        # conv1
+        self.conv = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=make_divisible(inplanes * scale),
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            groups=1,
+            if_act=True,
+            act="hard_swish",
+            name="conv1",
+        )
+
+        self.stages = nn.ModuleList()
+        self.out_channels = []
+        block_list = []
+        i = 0
+        inplanes = make_divisible(inplanes * scale)
+        for k, exp, c, se, nl, s in cfg:
+            se = se and not self.disable_se
+            # A stride-2 block after the first three starts a new stage; the
+            # blocks gathered so far are sealed and their width recorded.
+            if s == 2 and i > 2:
+                self.out_channels.append(inplanes)
+                self.stages.append(nn.Sequential(*block_list))
+                block_list = []
+            block_list.append(
+                ResidualUnit(
+                    in_channels=inplanes,
+                    mid_channels=make_divisible(scale * exp),
+                    out_channels=make_divisible(scale * c),
+                    kernel_size=k,
+                    stride=s,
+                    use_se=se,
+                    act=nl,
+                    name="conv" + str(i + 2),
+                )
+            )
+            inplanes = make_divisible(scale * c)
+            i += 1
+        block_list.append(
+            ConvBNLayer(
+                in_channels=inplanes,
+                out_channels=make_divisible(scale * cls_ch_squeeze),
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                groups=1,
+                if_act=True,
+                act="hard_swish",
+                name="conv_last",
+            )
+        )
+        self.stages.append(nn.Sequential(*block_list))
+        self.out_channels.append(make_divisible(scale * cls_ch_squeeze))
+        # for i, stage in enumerate(self.stages):
+        #     self.add_sublayer(sublayer=stage, name="stage{}".format(i))
+
+    def forward(self, x):
+        """Return the output of every stage, in order, as a list."""
+        x = self.conv(x)
+        out_list = []
+        for stage in self.stages:
+            x = stage(x)
+            out_list.append(x)
+        return out_list
diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_donut_swin.py 
b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_donut_swin.py new file mode 100644 index 0000000000000000000000000000000000000000..73bb3950786366a897a4a891f7547deda167ed82 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_donut_swin.py @@ -0,0 +1,1277 @@ +import collections.abc +from collections import OrderedDict +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class DonutSwinConfig(object): + model_type = "donut-swin" + + attribute_map = { + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + } + + def __init__( + self, + image_size=224, + patch_size=4, + num_channels=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + drop_path_rate=0.1, + hidden_act="gelu", + use_absolute_embeddings=False, + initializer_range=0.02, + layer_norm_eps=1e-5, + **kwargs, + ): + super().__init__() + + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.embed_dim = embed_dim + self.depths = depths + self.num_layers = len(depths) + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.drop_path_rate = drop_path_rate + self.hidden_act = hidden_act + self.use_absolute_embeddings = use_absolute_embeddings + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1)) + + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + print(f"Can't set {key} with value {value} for 
{self}") + raise err + + +@dataclass +# Copied from transformers.models.swin.modeling_swin.SwinEncoderOutput with Swin->DonutSwin +class DonutSwinEncoderOutput(OrderedDict): + last_hidden_state = None + hidden_states = None + attentions = None + reshaped_hidden_states = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = dict(self.items()) + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name, value): + if name in self.keys() and value is not None: + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, value): + super().__setitem__(key, value) + super().__setattr__(key, value) + + def to_tuple(self): + """ + Convert self to a tuple containing all the attributes/keys that are not `None`. + """ + return tuple(self[k] for k in self.keys()) + + +@dataclass +# Copied from transformers.models.swin.modeling_swin.SwinModelOutput with Swin->DonutSwin +class DonutSwinModelOutput(OrderedDict): + last_hidden_state = None + pooler_output = None + hidden_states = None + attentions = None + reshaped_hidden_states = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = dict(self.items()) + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name, value): + if name in self.keys() and value is not None: + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, value): + super().__setitem__(key, value) + super().__setattr__(key, value) + + def to_tuple(self): + """ + Convert self to a tuple containing all the attributes/keys that are not `None`. 
+ """ + return tuple(self[k] for k in self.keys()) + + +# Copied from transformers.models.swin.modeling_swin.window_partition +def window_partition(input_feature, window_size): + """ + Partitions the given input into windows. + """ + batch_size, height, width, num_channels = input_feature.shape + input_feature = input_feature.reshape( + [ + batch_size, + height // window_size, + window_size, + width // window_size, + window_size, + num_channels, + ] + ) + windows = input_feature.transpose([0, 1, 3, 2, 4, 5]).reshape( + [-1, window_size, window_size, num_channels] + ) + return windows + + +# Copied from transformers.models.swin.modeling_swin.window_reverse +def window_reverse(windows, window_size, height, width): + """ + Merges windows to produce higher resolution features. + """ + num_channels = windows.shape[-1] + windows = windows.reshape( + [ + -1, + height // window_size, + width // window_size, + window_size, + window_size, + num_channels, + ] + ) + windows = windows.transpose([0, 1, 3, 2, 4, 5]).reshape( + [-1, height, width, num_channels] + ) + return windows + + +# Copied from transformers.models.swin.modeling_swin.SwinEmbeddings with Swin->DonutSwin +class DonutSwinEmbeddings(nn.Module): + """ + Construct the patch and position embeddings. Optionally, also the mask token. 
+ """ + + def __init__(self, config, use_mask_token=False): + super().__init__() + + self.patch_embeddings = DonutSwinPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.patch_grid = self.patch_embeddings.grid_size + if use_mask_token: + # self.mask_token = paddle.create_parameter( + # [1, 1, config.embed_dim], dtype="float32" + # ) + self.mask_token = nn.Parameter( + nn.init.xavier_uniform_(torch.zeros(1, 1, config.embed_dim).to(torch.float32)) + ) + nn.init.zeros_(self.mask_token) + else: + self.mask_token = None + if config.use_absolute_embeddings: + # self.position_embeddings = paddle.create_parameter( + # [1, num_patches + 1, config.embed_dim], dtype="float32" + # ) + self.position_embeddings = nn.Parameter( + nn.init.xavier_uniform_(torch.zeros(1, num_patches + 1, config.embed_dim).to(torch.float32)) + ) + nn.init.zeros_(self.position_embedding) + else: + self.position_embeddings = None + + self.norm = nn.LayerNorm(config.embed_dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, pixel_values, bool_masked_pos=None): + + embeddings, output_dimensions = self.patch_embeddings(pixel_values) + embeddings = self.norm(embeddings) + + batch_size, seq_len, _ = embeddings.shape + + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + if self.position_embeddings is not None: + embeddings = embeddings + self.position_embeddings + embeddings = self.dropout(embeddings) + return embeddings, output_dimensions + + +class MyConv2d(nn.Conv2d): + def __init__( + self, + in_channel, + out_channels, + kernel_size, + stride=1, + padding="SAME", + dilation=1, + groups=1, + bias_attr=False, + eps=1e-6, + ): + super().__init__( + in_channel, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + 
bias_attr=bias_attr, + ) + # self.weight = paddle.create_parameter( + # [out_channels, in_channel, kernel_size[0], kernel_size[1]], dtype="float32" + # ) + self.weight = torch.Parameter( + nn.init.xavier_uniform_( + torch.zeros(out_channels, in_channel, kernel_size[0], kernel_size[1]).to(torch.float32) + ) + ) + # self.bias = paddle.create_parameter([out_channels], dtype="float32") + self.bias = torch.Parameter( + nn.init.xavier_uniform_( + torch.zeros(out_channels).to(torch.float32) + ) + ) + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + + def forward(self, x): + x = F.conv2d( + x, + self.weight, + self.bias, + self._stride, + self._padding, + self._dilation, + self._groups, + ) + return x + + +# Copied from transformers.models.swin.modeling_swin.SwinPatchEmbeddings +class DonutSwinPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. 
+ """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.embed_dim + image_size = ( + image_size + if isinstance(image_size, collections.abc.Iterable) + else (image_size, image_size) + ) + patch_size = ( + patch_size + if isinstance(patch_size, collections.abc.Iterable) + else (patch_size, patch_size) + ) + num_patches = (image_size[1] // patch_size[1]) * ( + image_size[0] // patch_size[0] + ) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + self.is_export = config.is_export + self.grid_size = ( + image_size[0] // patch_size[0], + image_size[1] // patch_size[1], + ) + self.projection = nn.Conv2D( + num_channels, hidden_size, kernel_size=patch_size, stride=patch_size + ) + + def maybe_pad(self, pixel_values, height, width): + if width % self.patch_size[1] != 0: + pad_values = (0, self.patch_size[1] - width % self.patch_size[1]) + if self.is_export: + pad_values = torch.tensor(pad_values, dtype=torch.int32) + pixel_values = nn.functional.pad(pixel_values, pad_values) + if height % self.patch_size[0] != 0: + pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0]) + if self.is_export: + pad_values = torch.tensor(pad_values, dtype=torch.int32) + pixel_values = nn.functional.pad(pixel_values, pad_values) + return pixel_values + + def forward(self, pixel_values) -> Tuple[torch.Tensor, Tuple[int]]: + _, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
+ ) + pixel_values = self.maybe_pad(pixel_values, height, width) + embeddings = self.projection(pixel_values) + + _, _, height, width = embeddings.shape + output_dimensions = (height, width) + embeddings = embeddings.flatten(2).transpose([0, 2, 1]) + + return embeddings, output_dimensions + + +# Copied from transformers.models.swin.modeling_swin.SwinPatchMerging +class DonutSwinPatchMerging(nn.Module): + """ + Patch Merging Layer. + + Args: + input_resolution (`Tuple[int]`): + Resolution of input feature. + dim (`int`): + Number of input channels. + norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. + """ + + def __init__( + self, + input_resolution: Tuple[int], + dim: int, + norm_layer: nn.Module = nn.LayerNorm, + is_export=False, + ): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + self.is_export = is_export + + def maybe_pad(self, input_feature, height, width): + should_pad = (height % 2 == 1) or (width % 2 == 1) + if should_pad: + pad_values = (0, 0, 0, width % 2, 0, height % 2) + if self.is_export: + pad_values = torch.tensor(pad_values, dtype=torch.int32) + input_feature = nn.functional.pad(input_feature, pad_values) + + return input_feature + + def forward( + self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int] + ) -> torch.Tensor: + height, width = input_dimensions + batch_size, dim, num_channels = input_feature.shape + + input_feature = input_feature.reshape([batch_size, height, width, num_channels]) + + input_feature = self.maybe_pad(input_feature, height, width) + input_feature_0 = input_feature[:, 0::2, 0::2, :] + input_feature_1 = input_feature[:, 1::2, 0::2, :] + input_feature_2 = input_feature[:, 0::2, 1::2, :] + input_feature_3 = input_feature[:, 1::2, 1::2, :] + input_feature = torch.cat( + [input_feature_0, input_feature_1, input_feature_2, input_feature_3], 
-1 + ) + input_feature = input_feature.reshape( + [batch_size, -1, 4 * num_channels] + ) # batch_size height/2*width/2 4*C + + input_feature = self.norm(input_feature) + input_feature = self.reduction(input_feature) + + return input_feature + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path( + input: torch.Tensor, drop_prob: float = 0.0, training: bool = False +) -> torch.Tensor: + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * ( + input.ndim - 1 + ) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand( + shape, + dtype=input.dtype, + ) + random_tensor.floor_() # binarize + output = input / keep_prob * random_tensor + return output + + +# Copied from transformers.models.swin.modeling_swin.SwinDropPath +class DonutSwinDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class DonutSwinSelfAttention(nn.Module): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.window_size = ( + window_size + if isinstance(window_size, collections.abc.Iterable) + else (window_size, window_size) + ) + # self.relative_position_bias_table = paddle.create_parameter( + # [(2 * self.window_size[0] - 1) * (2 * 
self.window_size[1] - 1), num_heads], + # dtype="float32", + # ) + self.relative_position_bias_table = torch.Parameter( + nn.init.xavier_normal_( + torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads).to(torch.float32) + ) + ) + + nn.init.zeros_(self.relative_position_bias_table) + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij")) + coords_flatten = torch.flatten(coords, 1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.transpose([1, 2, 0]) + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + self.query = nn.Linear( + self.all_head_size, self.all_head_size, bias_attr=config.qkv_bias + ) + self.key = nn.Linear( + self.all_head_size, self.all_head_size, bias_attr=config.qkv_bias + ) + self.value = nn.Linear( + self.all_head_size, self.all_head_size, bias_attr=config.qkv_bias + ) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [ + self.num_attention_heads, + self.attention_head_size, + ] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask=None, + head_mask=None, + output_attentions=False, + ) -> Tuple[torch.Tensor]: + batch_size, dim, num_channels = hidden_states.shape + mixed_query_layer = self.query(hidden_states) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = 
self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.reshape([-1]) + ] + relative_position_bias = relative_position_bias.reshape( + [ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1, + ] + ) + + relative_position_bias = relative_position_bias.transpose([2, 0, 1]) + attention_scores = attention_scores + relative_position_bias.unsqueeze(0) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in DonutSwinModel forward() function) + mask_shape = attention_mask.shape[0] + attention_scores = attention_scores.reshape( + [ + batch_size // mask_shape, + mask_shape, + self.num_attention_heads, + dim, + dim, + ] + ) + attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze( + 0 + ) + attention_scores = attention_scores.reshape( + [-1, self.num_attention_heads, dim, dim] + ) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = tuple(context_layer.shape[:-2]) + ( + self.all_head_size, + ) + context_layer = context_layer.reshape(new_context_layer_shape) + outputs = ( + (context_layer, attention_probs) if output_attentions else (context_layer,) + ) + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinSelfOutput +class DonutSwinSelfOutput(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward( + self, hidden_states: torch.Tensor, input_tensor: torch.Tensor + ) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.swin.modeling_swin.SwinAttention with Swin->DonutSwin +class DonutSwinAttention(nn.Module): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + self.self = DonutSwinSelfAttention(config, dim, num_heads, window_size) + self.output = DonutSwinSelfOutput(config, dim) + self.pruned_heads = set() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask=None, + head_mask=None, + output_attentions=False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, attention_mask, head_mask, output_attentions + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[ + 1: + ] # add attentions if we output them + return outputs + + +# Copied from transformers.models.swin.modeling_swin.SwinIntermediate +class DonutSwinIntermediate(nn.Module): + def __init__(self, config, dim): + super().__init__() + self.dense = 
# Copied from transformers.models.swin.modeling_swin.SwinOutput
class DonutSwinOutput(nn.Module):
    """Project the expanded MLP activations back to ``dim`` and apply dropout."""

    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


# Copied from transformers.models.swin.modeling_swin.SwinLayer with Swin->DonutSwin
class DonutSwinLayer(nn.Module):
    """One Swin block: (shifted-)window attention followed by an MLP.

    Both sub-layers are wrapped in a residual connection; the attention branch
    additionally goes through ``drop_path``.
    """

    def __init__(self, config, dim, input_resolution, num_heads, shift_size=0):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.shift_size = shift_size
        self.window_size = config.window_size
        self.input_resolution = input_resolution
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = DonutSwinAttention(
            config, dim, num_heads, window_size=self.window_size
        )
        self.drop_path = (
            DonutSwinDropPath(config.drop_path_rate)
            if config.drop_path_rate > 0.0
            else nn.Identity()
        )
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.intermediate = DonutSwinIntermediate(config, dim)
        self.output = DonutSwinOutput(config, dim)
        self.is_export = config.is_export

    def set_shift_and_window_size(self, input_resolution):
        """Disable shifting and shrink the window when the input is not larger than it."""
        if min(input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(input_resolution)

    def _build_shifted_window_mask(self, height, width, dtype, device=None):
        """Build the SW-MSA mask (0 inside a region, -100 across regions)."""
        img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device)
        height_slices = (
            slice(0, -self.window_size),
            slice(-self.window_size, -self.shift_size),
            slice(-self.shift_size, None),
        )
        width_slices = (
            slice(0, -self.window_size),
            slice(-self.window_size, -self.shift_size),
            slice(-self.shift_size, None),
        )
        # Label each of the 3x3 regions created by the cyclic shift with its own id.
        count = 0
        for height_slice in height_slices:
            for width_slice in width_slices:
                img_mask[:, height_slice, width_slice, :] = count
                count += 1

        mask_windows = window_partition(img_mask, self.window_size)
        mask_windows = mask_windows.reshape([-1, self.window_size * self.window_size])
        # Positions with different region ids must not attend to each other.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
            attn_mask == 0, float(0.0)
        )
        return attn_mask

    def get_attn_mask_export(self, height, width, dtype, device=None):
        """Export-path variant of :meth:`get_attn_mask`; returns ``None`` without a shift."""
        # torch.as_tensor accepts a Python bool as well as a tensor, unlike the
        # legacy torch.Tensor(...) constructor the port used here.
        if not bool(torch.as_tensor(self.shift_size > 0)):
            return None
        return self._build_shifted_window_mask(height, width, dtype, device)

    def get_attn_mask(self, height, width, dtype, device=None):
        """Return the shifted-window attention mask, or ``None`` when ``shift_size`` is 0."""
        if self.shift_size > 0:
            # calculate attention mask for SW-MSA
            return self._build_shifted_window_mask(height, width, dtype, device)
        return None

    def maybe_pad(self, hidden_states, height, width):
        """Zero-pad a ``(batch, height, width, channels)`` tensor to window multiples.

        Returns the padded tensor and the pad tuple that was applied.
        """
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        # torch.nn.functional.pad consumes (left, right) pairs starting from the
        # LAST dimension: channels, then width, then height, then batch.
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom, 0, 0)
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask=None,
        output_attentions=False,
        always_partition=False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the block on ``(batch, height * width, channels)`` tokens.

        Returns ``(layer_output,)`` or ``(layer_output, attention_probs)`` when
        ``output_attentions`` is set.
        """
        if not always_partition:
            self.set_shift_and_window_size(input_dimensions)
        height, width = input_dimensions
        batch_size, _, channels = hidden_states.shape
        shortcut = hidden_states

        hidden_states = self.layernorm_before(hidden_states)
        hidden_states = hidden_states.reshape([batch_size, height, width, channels])

        # pad hidden_states to multiples of window size
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
        _, height_pad, width_pad, _ = hidden_states.shape

        # cyclic shift
        if self.shift_size > 0:
            shift_value = (-self.shift_size, -self.shift_size)
            if self.is_export:
                # NOTE(review): torch.roll documents `shifts` as int or tuple of
                # ints - confirm a tensor is accepted on the export path.
                shift_value = torch.tensor(shift_value, dtype=torch.int32)
            shifted_hidden_states = torch.roll(
                hidden_states, shifts=shift_value, dims=(1, 2)
            )
        else:
            shifted_hidden_states = hidden_states

        # partition windows
        hidden_states_windows = window_partition(
            shifted_hidden_states, self.window_size
        )
        hidden_states_windows = hidden_states_windows.reshape(
            [-1, self.window_size * self.window_size, channels]
        )
        # Build the mask on the same device as the activations it is added to.
        attn_mask = self.get_attn_mask(
            height_pad,
            width_pad,
            dtype=hidden_states.dtype,
            device=hidden_states.device,
        )

        attention_outputs = self.attention(
            hidden_states_windows,
            attn_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = attention_outputs[0]

        attention_windows = attention_output.reshape(
            [-1, self.window_size, self.window_size, channels]
        )
        shifted_windows = window_reverse(
            attention_windows, self.window_size, height_pad, width_pad
        )

        # reverse cyclic shift
        if self.shift_size > 0:
            shift_value = (self.shift_size, self.shift_size)
            if self.is_export:
                shift_value = torch.tensor(shift_value, dtype=torch.int32)
            attention_windows = torch.roll(
                shifted_windows, shifts=shift_value, dims=(1, 2)
            )
        else:
            attention_windows = shifted_windows

        # Indices 3 and 5 hold the width / height padding amounts.
        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        attention_windows = attention_windows.reshape(
            [batch_size, height * width, channels]
        )
        hidden_states = shortcut + self.drop_path(attention_windows)
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)
        layer_output = hidden_states + self.output(layer_output)
        layer_outputs = (
            (layer_output, attention_outputs[1])
            if output_attentions
            else (layer_output,)
        )
        return layer_outputs
# Copied from transformers.models.swin.modeling_swin.SwinEncoder with Swin->DonutSwin
class DonutSwinEncoder(nn.Module):
    """Stack of ``DonutSwinStage`` modules, one per entry of ``config.depths``.

    Every stage except the last is built with ``DonutSwinPatchMerging`` as its
    downsampling layer.
    """

    def __init__(self, config, grid_size):
        super().__init__()
        self.num_layers = len(config.depths)
        self.config = config
        # Stochastic-depth rates, linearly spaced over all blocks of all stages.
        dpr = [
            x.item()
            for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))
        ]
        self.layers = nn.ModuleList(
            [
                DonutSwinStage(
                    config=config,
                    dim=int(config.embed_dim * 2**i_layer),
                    input_resolution=(
                        grid_size[0] // (2**i_layer),
                        grid_size[1] // (2**i_layer),
                    ),
                    depth=config.depths[i_layer],
                    num_heads=config.num_heads[i_layer],
                    drop_path=dpr[
                        sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])
                    ],
                    downsample=(
                        DonutSwinPatchMerging
                        if (i_layer < self.num_layers - 1)
                        else None
                    ),
                )
                for i_layer in range(self.num_layers)
            ]
        )

        self.gradient_checkpointing = False

    @staticmethod
    def _to_channels_first(hidden_states, spatial_dims):
        """Reshape ``(batch, h * w, channels)`` tokens to ``(batch, channels, h, w)``."""
        batch_size, _, hidden_size = hidden_states.shape
        reshaped = hidden_states.reshape([batch_size, *spatial_dims, hidden_size])
        # torch's Tensor.transpose only swaps two dims; a full reorder needs permute.
        return reshaped.permute(0, 3, 1, 2)

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        output_hidden_states_before_downsampling=False,
        always_partition=False,
        return_dict=True,
    ):
        """Run every stage in order.

        With ``return_dict=False`` the result is a tuple of the non-``None``
        values among ``(last_hidden_state, all_hidden_states, all_attentions)``;
        otherwise a ``DonutSwinEncoderOutput`` is returned.
        """
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if output_hidden_states:
            # Record the embedding output before any stage runs.
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (
                self._to_channels_first(hidden_states, input_dimensions),
            )

        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                # NOTE(review): _gradient_checkpointing_func is not defined in
                # this class - confirm something assigns it before enabling
                # gradient_checkpointing.
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    input_dimensions,
                    layer_head_mask,
                    output_attentions,
                    always_partition,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    input_dimensions,
                    layer_head_mask,
                    output_attentions,
                    always_partition,
                )

            hidden_states = layer_outputs[0]
            hidden_states_before_downsampling = layer_outputs[1]
            output_dimensions = layer_outputs[2]

            # The last two entries are the spatial size after downsampling.
            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (
                    self._to_channels_first(
                        hidden_states_before_downsampling,
                        (output_dimensions[0], output_dimensions[1]),
                    ),
                )
            elif output_hidden_states:
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (
                    self._to_channels_first(hidden_states, input_dimensions),
                )

            if output_attentions:
                all_self_attentions += layer_outputs[3:]

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, all_hidden_states, all_self_attentions]
                if v is not None
            )

        return DonutSwinEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )
class DonutSwinModel(DonutSwinPreTrainedModel):
    """Donut Swin backbone: patch embeddings, encoder and optional average pooling.

    The configuration is assembled inline from a fixed dictionary; only
    ``hidden_size``, ``num_layers``, ``in_channels``, ``num_heads`` and
    ``is_export`` are taken from the constructor arguments.
    """

    def __init__(
        self,
        in_channels=3,
        hidden_size=1024,
        num_layers=4,
        num_heads=None,
        add_pooling_layer=True,
        use_mask_token=False,
        is_export=False,
    ):
        super().__init__()
        # Avoid a shared mutable default; None selects the stock head counts.
        if num_heads is None:
            num_heads = [4, 8, 16, 32]
        donut_swin_config = {
            "return_dict": True,
            "output_hidden_states": False,
            "output_attentions": False,
            "use_bfloat16": False,
            "tf_legacy_loss": False,
            "pruned_heads": {},
            "tie_word_embeddings": True,
            "chunk_size_feed_forward": 0,
            "is_encoder_decoder": False,
            "is_decoder": False,
            "cross_attention_hidden_size": None,
            "add_cross_attention": False,
            "tie_encoder_decoder": False,
            "max_length": 20,
            "min_length": 0,
            "do_sample": False,
            "early_stopping": False,
            "num_beams": 1,
            "num_beam_groups": 1,
            "diversity_penalty": 0.0,
            "temperature": 1.0,
            "top_k": 50,
            "top_p": 1.0,
            "typical_p": 1.0,
            "repetition_penalty": 1.0,
            "length_penalty": 1.0,
            "no_repeat_ngram_size": 0,
            "encoder_no_repeat_ngram_size": 0,
            "bad_words_ids": None,
            "num_return_sequences": 1,
            "output_scores": False,
            "return_dict_in_generate": False,
            "forced_bos_token_id": None,
            "forced_eos_token_id": None,
            "remove_invalid_values": False,
            "exponential_decay_length_penalty": None,
            "suppress_tokens": None,
            "begin_suppress_tokens": None,
            "architectures": None,
            "finetuning_task": None,
            "id2label": {0: "LABEL_0", 1: "LABEL_1"},
            "label2id": {"LABEL_0": 0, "LABEL_1": 1},
            "tokenizer_class": None,
            "prefix": None,
            "bos_token_id": None,
            "pad_token_id": None,
            "eos_token_id": None,
            "sep_token_id": None,
            "decoder_start_token_id": None,
            "task_specific_params": None,
            "problem_type": None,
            "_name_or_path": "",
            "_commit_hash": None,
            "_attn_implementation_internal": None,
            "transformers_version": None,
            "hidden_size": hidden_size,
            "num_layers": num_layers,
            "path_norm": True,
            "use_2d_embeddings": False,
            "image_size": [420, 420],
            "patch_size": 4,
            "num_channels": in_channels,
            "embed_dim": 128,
            "depths": [2, 2, 14, 2],
            "num_heads": num_heads,
            "window_size": 5,
            "mlp_ratio": 4.0,
            "qkv_bias": True,
            "hidden_dropout_prob": 0.0,
            "attention_probs_dropout_prob": 0.0,
            "drop_path_rate": 0.1,
            "hidden_act": "gelu",
            "use_absolute_embeddings": False,
            "layer_norm_eps": 1e-05,
            "initializer_range": 0.02,
            "is_export": is_export,
        }

        config = DonutSwinConfig(**donut_swin_config)
        self.config = config
        self.num_layers = len(config.depths)
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))

        self.embeddings = DonutSwinEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = DonutSwinEncoder(config, self.embeddings.patch_grid)

        # torch spells the layer AdaptiveAvgPool1d (lower-case "d").
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
        self.out_channels = hidden_size
        # NOTE(review): post_init() applies DonutSwinPreTrainedModel._init_weights
        # to every submodule, and that method references nn.Conv2D (torch names it
        # nn.Conv2d) - confirm the base class before relying on this constructor.
        self.post_init()

    def get_input_embeddings(self):
        """Return the patch-embedding module of the embedding layer."""
        return self.embeddings.patch_embeddings

    def forward(
        self,
        input_data=None,
        bool_masked_pos=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ) -> Union[Tuple, DonutSwinModelOutput]:
        r"""
        input_data:
            In training mode a ``(pixel_values, label, attention_mask)`` triple;
            in eval mode either the pixel tensor itself or a list whose first
            element is the pixel tensor.
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns the model output; in training mode (with ``return_dict`` enabled)
        the output is returned together with ``label`` and ``attention_mask``.
        """
        if self.training:
            pixel_values, label, attention_mask = input_data
        else:
            if isinstance(input_data, list):
                pixel_values = input_data[0]
            else:
                pixel_values = input_data
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.return_dict
        )

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")
        num_channels = pixel_values.shape[1]
        if num_channels == 1:
            # Replicate a single-channel image to three channels.
            pixel_values = torch.repeat_interleave(pixel_values, repeats=3, dim=1)

        head_mask = self.get_head_mask(head_mask, len(self.config.depths))

        embedding_output, input_dimensions = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos
        )

        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        pooled_output = None
        if self.pooler is not None:
            # Pool over the token axis: (batch, tokens, channels) -> (batch, channels).
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output

        donut_swin_output = DonutSwinModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )
        if self.training:
            return donut_swin_output, label, attention_mask
        else:
            return donut_swin_output
class ConvBNAct(nn.Module):
    """Bias-free Conv2d -> BatchNorm2d -> optional ReLU."""

    def __init__(
        self, in_channels, out_channels, kernel_size, stride, groups=1, use_act=True
    ):
        super().__init__()
        self.use_act = use_act
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        if self.use_act:
            self.act = nn.ReLU()

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        if self.use_act:
            x = self.act(x)
        return x


class ESEModule(nn.Module):
    """Channel gate: global average pool -> 1x1 conv -> sigmoid, multiplied onto the input."""

    def __init__(self, channels):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv = nn.Conv2d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=1,
            stride=1,
            padding=0,
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        identity = x
        x = self.avg_pool(x)
        x = self.conv(x)
        x = self.sigmoid(x)
        return x * identity


class HG_Block(nn.Module):
    """Chain of 3x3 convs whose input and every intermediate output are concatenated.

    The concatenation is squeezed by a 1x1 conv, gated by ``ESEModule`` and,
    when ``identity`` is true, added to the block input.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        layer_num,
        identity=False,
    ):
        super().__init__()
        self.identity = identity

        self.layers = nn.ModuleList()
        self.layers.append(
            ConvBNAct(
                in_channels=in_channels,
                out_channels=mid_channels,
                kernel_size=3,
                stride=1,
            )
        )
        for _ in range(layer_num - 1):
            self.layers.append(
                ConvBNAct(
                    in_channels=mid_channels,
                    out_channels=mid_channels,
                    kernel_size=3,
                    stride=1,
                )
            )

        # feature aggregation
        total_channels = in_channels + layer_num * mid_channels
        self.aggregation_conv = ConvBNAct(
            in_channels=total_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
        )
        self.att = ESEModule(out_channels)

    def forward(self, x):
        identity = x
        output = [x]
        for layer in self.layers:
            x = layer(x)
            output.append(x)
        x = torch.cat(output, dim=1)
        x = self.aggregation_conv(x)
        x = self.att(x)
        if self.identity:
            x += identity
        return x


class HG_Stage(nn.Module):
    """Optional depthwise strided conv followed by ``block_num`` ``HG_Block`` modules."""

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        block_num,
        layer_num,
        downsample=True,
        stride=(2, 1),
    ):
        super().__init__()
        # None (falsy, like the original False) means "no downsampling layer".
        self.downsample = None
        if downsample:
            self.downsample = ConvBNAct(
                in_channels=in_channels,
                out_channels=in_channels,
                kernel_size=3,
                stride=stride,
                groups=in_channels,
                use_act=False,
            )

        blocks_list = [
            HG_Block(in_channels, mid_channels, out_channels, layer_num, identity=False)
        ]
        # Follow-up blocks keep the channel count, so they can use the residual add.
        blocks_list.extend(
            HG_Block(out_channels, mid_channels, out_channels, layer_num, identity=True)
            for _ in range(block_num - 1)
        )
        self.blocks = nn.Sequential(*blocks_list)

    def forward(self, x):
        if self.downsample is not None:
            x = self.downsample(x)
        x = self.blocks(x)
        return x


class PPHGNet(nn.Module):
    """
    PPHGNet
    Args:
        stem_channels: list. Output channels of the stem convolutions; not modified.
        stage_config: dict. One entry per stage:
            [in_channels, mid_channels, out_channels, block_num, downsample, stride].
        layer_num: int. Number of layers of HG_Block.
        in_channels: int. Channels of the input image.
        det: bool. If True, forward returns the feature maps of the stages listed
            in out_indices; otherwise a single pooled feature map.
        out_indices: list. Stage indices to return when det is True
            (defaults to [0, 1, 2, 3]).
    Attributes:
        out_channels: list of channel counts of the selected stages when det is
            True, otherwise the channel count of the last stage.
    """

    def __init__(
        self,
        stem_channels,
        stage_config,
        layer_num,
        in_channels=3,
        det=False,
        out_indices=None,
    ):
        super().__init__()
        self.det = det
        self.out_indices = out_indices if out_indices is not None else [0, 1, 2, 3]

        # stem - build a new list instead of insert()-ing into the caller's one,
        # so a shared config list is not grown on every construction.
        stem_channels = [in_channels, *stem_channels]
        self.stem = nn.Sequential(
            *[
                ConvBNAct(
                    in_channels=c_in,
                    out_channels=c_out,
                    kernel_size=3,
                    stride=2 if i == 0 else 1,
                )
                for i, (c_in, c_out) in enumerate(
                    zip(stem_channels[:-1], stem_channels[1:])
                )
            ]
        )

        if self.det:
            self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # stages
        self.stages = nn.ModuleList()
        self.out_channels = []
        last_out_channels = stem_channels[-1]
        for block_id, stage_args in enumerate(stage_config.values()):
            (
                stage_in,
                stage_mid,
                stage_out,
                block_num,
                downsample,
                stride,
            ) = stage_args
            self.stages.append(
                HG_Stage(
                    stage_in,
                    stage_mid,
                    stage_out,
                    block_num,
                    layer_num,
                    downsample,
                    stride,
                )
            )
            last_out_channels = stage_out
            if block_id in self.out_indices:
                self.out_channels.append(stage_out)

        if not self.det:
            # forward() returns the last stage's feature map in this mode.
            self.out_channels = last_out_channels

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.ones_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Linear):
                nn.init.zeros_(m.bias)

    def forward(self, x):
        x = self.stem(x)
        if self.det:
            x = self.pool(x)

        out = []
        for i, stage in enumerate(self.stages):
            x = stage(x)
            if self.det and i in self.out_indices:
                out.append(x)
        if self.det:
            return out

        # Training pools to a fixed 1x40 map; inference uses a 3x2 average pool.
        if self.training:
            x = F.adaptive_avg_pool2d(x, [1, 40])
        else:
            x = F.avg_pool2d(x, [3, 2])
        return x


def PPHGNet_small(pretrained=False, use_ssld=False, det=False, **kwargs):
    """
    PPHGNet_small
    Args:
        pretrained: accepted for interface compatibility; not used by this function.
        use_ssld: accepted for interface compatibility; not used by this function.
        det: bool=False. Select the detection stage layout instead of the
            recognition one.
        **kwargs: forwarded to `PPHGNet`.
    Returns:
        model: nn.Module. The constructed `PPHGNet`.
    """
    stage_config_det = {
        # in_channels, mid_channels, out_channels, blocks, downsample, stride
        "stage1": [128, 128, 256, 1, False, 2],
        "stage2": [256, 160, 512, 1, True, 2],
        "stage3": [512, 192, 768, 2, True, 2],
        "stage4": [768, 224, 1024, 1, True, 2],
    }

    stage_config_rec = {
        # in_channels, mid_channels, out_channels, blocks, downsample, stride
        "stage1": [128, 128, 256, 1, True, [2, 1]],
        "stage2": [256, 160, 512, 1, True, [1, 2]],
        "stage3": [512, 192, 768, 2, True, [2, 1]],
        "stage4": [768, 224, 1024, 1, True, [2, 1]],
    }

    model = PPHGNet(
        stem_channels=[64, 64, 128],
        stage_config=stage_config_det if det else stage_config_rec,
        layer_num=6,
        det=det,
        **kwargs
    )
    return model
NET_CONFIG_det = {
    "blocks2":
    # k, in_c, out_c, s, use_se
    [[3, 16, 32, 1, False]],
    "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]],
    "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]],
    "blocks5": [
        [3, 128, 256, 2, False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
    ],
    "blocks6": [
        [5, 256, 512, 2, True],
        [5, 512, 512, 1, True],
        [5, 512, 512, 1, False],
        [5, 512, 512, 1, False],
    ],
}

NET_CONFIG_rec = {
    "blocks2":
    # k, in_c, out_c, s, use_se
    [[3, 16, 32, 1, False]],
    "blocks3": [[3, 32, 64, 1, False], [3, 64, 64, 1, False]],
    "blocks4": [[3, 64, 128, (2, 1), False], [3, 128, 128, 1, False]],
    "blocks5": [
        [3, 128, 256, (1, 2), False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
        [5, 256, 256, 1, False],
    ],
    "blocks6": [
        [5, 256, 512, (2, 1), True],
        [5, 512, 512, 1, True],
        [5, 512, 512, (2, 1), False],
        [5, 512, 512, 1, False],
    ],
}


def make_divisible(v, divisor=16, min_value=None):
    """Round ``v`` to the nearest multiple of ``divisor``.

    The result is never below ``min_value`` (default: ``divisor``) and is bumped
    up by one ``divisor`` if rounding would lose more than 10% of ``v``.
    """
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


class LearnableAffineBlock(nn.Module):
    """Learnable scalar affine transform ``scale * x + bias``.

    ``lr_mult`` and ``lab_lr`` are accepted but not used by this implementation.
    """

    def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.1):
        super().__init__()
        self.scale = nn.Parameter(torch.Tensor([scale_value]))
        self.bias = nn.Parameter(torch.Tensor([bias_value]))

    def forward(self, x):
        return self.scale * x + self.bias


class ConvBNLayer(nn.Module):
    """Bias-free Conv2d followed by BatchNorm2d (``lr_mult`` is accepted but unused)."""

    def __init__(
        self, in_channels, out_channels, kernel_size, stride, groups=1, lr_mult=1.0
    ):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2,
            groups=groups,
            bias=False,
        )

        self.bn = nn.BatchNorm2d(
            out_channels,
        )

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return x


class Act(nn.Module):
    """Activation ("hswish" or "relu") followed by a ``LearnableAffineBlock``."""

    def __init__(self, act="hswish", lr_mult=1.0, lab_lr=0.1):
        super().__init__()
        if act == "hswish":
            self.act = nn.Hardswish(inplace=True)
        else:
            assert act == "relu"
            self.act = Activation(act)
        self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr)

    def forward(self, x):
        return self.lab(self.act(x))


class LearnableRepLayer(nn.Module):
    """Re-parameterizable conv layer.

    Before :meth:`rep` the output is the sum of ``num_conv_branches`` KxK
    conv+BN branches, an optional 1x1 conv+BN branch (when ``kernel_size > 1``)
    and an optional BN-only identity branch (when channels match and
    ``stride == 1``). :meth:`rep` folds all branches into a single conv,
    ``reparam_conv``, which ``forward`` uses afterwards.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        groups=1,
        num_conv_branches=1,
        lr_mult=1.0,
        lab_lr=0.1,
    ):
        super().__init__()
        self.is_repped = False
        self.groups = groups
        self.stride = stride
        self.kernel_size = kernel_size
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_conv_branches = num_conv_branches
        self.padding = (kernel_size - 1) // 2

        self.identity = (
            nn.BatchNorm2d(
                num_features=in_channels,
            )
            if out_channels == in_channels and stride == 1
            else None
        )

        self.conv_kxk = nn.ModuleList(
            [
                ConvBNLayer(
                    in_channels,
                    out_channels,
                    kernel_size,
                    stride,
                    groups=groups,
                    lr_mult=lr_mult,
                )
                for _ in range(self.num_conv_branches)
            ]
        )

        self.conv_1x1 = (
            ConvBNLayer(
                in_channels, out_channels, 1, stride, groups=groups, lr_mult=lr_mult
            )
            if kernel_size > 1
            else None
        )

        self.lab = LearnableAffineBlock(lr_mult=lr_mult, lab_lr=lab_lr)
        self.act = Act(lr_mult=lr_mult, lab_lr=lab_lr)

    def forward(self, x):
        # for export
        if self.is_repped:
            out = self.lab(self.reparam_conv(x))
            if self.stride != 2:
                out = self.act(out)
            return out

        out = 0
        if self.identity is not None:
            out += self.identity(x)

        if self.conv_1x1 is not None:
            out += self.conv_1x1(x)

        for conv in self.conv_kxk:
            out += conv(x)

        out = self.lab(out)
        # The activation is skipped only when stride is exactly the int 2.
        if self.stride != 2:
            out = self.act(out)
        return out

    def rep(self):
        """Fuse all branches into ``self.reparam_conv``; a second call is a no-op."""
        if self.is_repped:
            return
        kernel, bias = self._get_kernel_bias()
        self.reparam_conv = nn.Conv2d(
            in_channels=self.in_channels,
            out_channels=self.out_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
            groups=self.groups,
        )
        # Detach so the fused weights do not carry the branches' autograd graph.
        self.reparam_conv.weight.data = kernel.detach()
        self.reparam_conv.bias.data = bias.detach()
        self.is_repped = True

    def _pad_kernel_1x1_to_kxk(self, kernel1x1, pad):
        """Zero-pad a 1x1 kernel spatially; a missing branch (non-tensor) maps to 0."""
        if not isinstance(kernel1x1, torch.Tensor):
            return 0
        else:
            return nn.functional.pad(kernel1x1, [pad, pad, pad, pad])

    def _get_kernel_bias(self):
        """Sum the fused (kernel, bias) pairs of every branch."""
        kernel_conv_1x1, bias_conv_1x1 = self._fuse_bn_tensor(self.conv_1x1)
        kernel_conv_1x1 = self._pad_kernel_1x1_to_kxk(
            kernel_conv_1x1, self.kernel_size // 2
        )

        kernel_identity, bias_identity = self._fuse_bn_tensor(self.identity)

        kernel_conv_kxk = 0
        bias_conv_kxk = 0
        for conv in self.conv_kxk:
            kernel, bias = self._fuse_bn_tensor(conv)
            kernel_conv_kxk += kernel
            bias_conv_kxk += bias

        kernel_reparam = kernel_conv_kxk + kernel_conv_1x1 + kernel_identity
        bias_reparam = bias_conv_kxk + bias_conv_1x1 + bias_identity
        return kernel_reparam, bias_reparam

    def _fuse_bn_tensor(self, branch):
        """Fold a branch's BatchNorm into an equivalent conv ``(kernel, bias)``.

        ``branch`` is a ``ConvBNLayer``, a bare ``nn.BatchNorm2d`` (identity
        branch) or ``None`` (absent branch, which contributes ``(0, 0)``).
        """
        if branch is None:
            return 0, 0
        if isinstance(branch, ConvBNLayer):
            kernel = branch.conv.weight
            bn = branch.bn
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            bn = branch
            if not hasattr(self, "id_tensor"):
                # Identity expressed as a conv kernel: a single 1 at the centre
                # tap of each output channel's own input channel.
                input_dim = self.in_channels // self.groups
                center = self.kernel_size // 2
                kernel_value = torch.zeros(
                    (self.in_channels, input_dim, self.kernel_size, self.kernel_size),
                    dtype=bn.weight.dtype,
                    device=bn.weight.device,
                )
                for i in range(self.in_channels):
                    kernel_value[i, i % input_dim, center, center] = 1
                self.id_tensor = kernel_value
            kernel = self.id_tensor.to(bn.weight.device)
        # torch's BatchNorm2d exposes running_mean / running_var / eps.
        std = (bn.running_var + bn.eps).sqrt()
        t = (bn.weight / std).reshape((-1, 1, 1, 1))
        return kernel * t, bn.bias - bn.running_mean * bn.weight / std


class SELayer(nn.Module):
    """Squeeze-and-excitation gate: pool -> 1x1 conv -> ReLU -> 1x1 conv -> hard-sigmoid."""

    def __init__(self, channel, reduction=4, lr_mult=1.0):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv1 = nn.Conv2d(
            in_channels=channel,
            out_channels=channel // reduction,
            kernel_size=1,
            stride=1,
            padding=0,
        )
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(
            in_channels=channel // reduction,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0,
        )
        self.hardsigmoid = nn.Hardsigmoid(inplace=True)

    def forward(self, x):
        identity = x
        x = self.avg_pool(x)
        x = self.conv1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.hardsigmoid(x)
        x = identity * x
        return x


class LCNetV3Block(nn.Module):
    """Depthwise ``LearnableRepLayer``, optional ``SELayer``, pointwise ``LearnableRepLayer``."""

    def __init__(
        self,
        in_channels,
        out_channels,
        stride,
        dw_size,
        use_se=False,
        conv_kxk_num=4,
        lr_mult=1.0,
        lab_lr=0.1,
    ):
        super().__init__()
        self.use_se = use_se
        self.dw_conv = LearnableRepLayer(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=dw_size,
            stride=stride,
            groups=in_channels,
            num_conv_branches=conv_kxk_num,
            lr_mult=lr_mult,
            lab_lr=lab_lr,
        )
        if use_se:
            self.se = SELayer(in_channels, lr_mult=lr_mult)
        self.pw_conv = LearnableRepLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            num_conv_branches=conv_kxk_num,
            lr_mult=lr_mult,
            lab_lr=lab_lr,
        )

    def forward(self, x):
        x = self.dw_conv(x)
        if self.use_se:
            x = self.se(x)
        x = self.pw_conv(x)
        return x
self.lr_mult_list = lr_mult_list + self.det = det + + self.net_config = NET_CONFIG_det if self.det else NET_CONFIG_rec + + assert isinstance( + self.lr_mult_list, (list, tuple) + ), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list) + ) + assert ( + len(self.lr_mult_list) == 6 + ), "lr_mult_list length should be 6 but got {}".format(len(self.lr_mult_list)) + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=make_divisible(16 * scale), + kernel_size=3, + stride=2, + lr_mult=self.lr_mult_list[0], + ) + + self.blocks2 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[1], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks2"]) + ] + ) + + self.blocks3 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[2], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks3"]) + ] + ) + + self.blocks4 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[3], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks4"]) + ] + ) + + self.blocks5 = nn.Sequential( + *[ + LCNetV3Block( + in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[4], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks5"]) + ] + ) + + self.blocks6 = nn.Sequential( + *[ + LCNetV3Block( + 
in_channels=make_divisible(in_c * scale), + out_channels=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + conv_kxk_num=conv_kxk_num, + lr_mult=self.lr_mult_list[5], + lab_lr=lab_lr, + ) + for i, (k, in_c, out_c, s, se) in enumerate(self.net_config["blocks6"]) + ] + ) + self.out_channels = make_divisible(512 * scale) + + if self.det: + mv_c = [16, 24, 56, 480] + self.out_channels = [ + make_divisible(self.net_config["blocks3"][-1][2] * scale), + make_divisible(self.net_config["blocks4"][-1][2] * scale), + make_divisible(self.net_config["blocks5"][-1][2] * scale), + make_divisible(self.net_config["blocks6"][-1][2] * scale), + ] + + self.layer_list = nn.ModuleList( + [ + nn.Conv2d(self.out_channels[0], int(mv_c[0] * scale), 1, 1, 0), + nn.Conv2d(self.out_channels[1], int(mv_c[1] * scale), 1, 1, 0), + nn.Conv2d(self.out_channels[2], int(mv_c[2] * scale), 1, 1, 0), + nn.Conv2d(self.out_channels[3], int(mv_c[3] * scale), 1, 1, 0), + ] + ) + self.out_channels = [ + int(mv_c[0] * scale), + int(mv_c[1] * scale), + int(mv_c[2] * scale), + int(mv_c[3] * scale), + ] + + def forward(self, x): + out_list = [] + x = self.conv1(x) + x = self.blocks2(x) + x = self.blocks3(x) + out_list.append(x) + x = self.blocks4(x) + out_list.append(x) + x = self.blocks5(x) + out_list.append(x) + x = self.blocks6(x) + out_list.append(x) + + if self.det: + out_list[0] = self.layer_list[0](out_list[0]) + out_list[1] = self.layer_list[1](out_list[1]) + out_list[2] = self.layer_list[2](out_list[2]) + out_list[3] = self.layer_list[3](out_list[3]) + return out_list + + if self.training: + x = F.adaptive_avg_pool2d(x, [1, 40]) + else: + x = F.avg_pool2d(x, [3, 2]) + return x diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py new file mode 100644 index 
0000000000000000000000000000000000000000..d284a6d49a2b4abfab285643aa849b9e6bf2db37 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py @@ -0,0 +1,136 @@ +from torch import nn + +from .det_mobilenet_v3 import ConvBNLayer, ResidualUnit, make_divisible + + +class MobileNetV3(nn.Module): + def __init__( + self, + in_channels=3, + model_name="small", + scale=0.5, + large_stride=None, + small_stride=None, + **kwargs + ): + super(MobileNetV3, self).__init__() + if small_stride is None: + small_stride = [2, 2, 2, 2] + if large_stride is None: + large_stride = [1, 2, 2, 2] + + assert isinstance( + large_stride, list + ), "large_stride type must " "be list but got {}".format(type(large_stride)) + assert isinstance( + small_stride, list + ), "small_stride type must " "be list but got {}".format(type(small_stride)) + assert ( + len(large_stride) == 4 + ), "large_stride length must be " "4 but got {}".format(len(large_stride)) + assert ( + len(small_stride) == 4 + ), "small_stride length must be " "4 but got {}".format(len(small_stride)) + + if model_name == "large": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, "relu", large_stride[0]], + [3, 64, 24, False, "relu", (large_stride[1], 1)], + [3, 72, 24, False, "relu", 1], + [5, 72, 40, True, "relu", (large_stride[2], 1)], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], + [3, 240, 80, False, "hard_swish", 1], + [3, 200, 80, False, "hard_swish", 1], + [3, 184, 80, False, "hard_swish", 1], + [3, 184, 80, False, "hard_swish", 1], + [3, 480, 112, True, "hard_swish", 1], + [3, 672, 112, True, "hard_swish", 1], + [5, 672, 160, True, "hard_swish", (large_stride[3], 1)], + [5, 960, 160, True, "hard_swish", 1], + [5, 960, 160, True, "hard_swish", 1], + ] + cls_ch_squeeze = 960 + elif model_name == "small": + cfg = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, "relu", (small_stride[0], 1)], + [3, 72, 24, False, "relu", (small_stride[1], 1)], + 
[3, 88, 24, False, "relu", 1], + [5, 96, 40, True, "hard_swish", (small_stride[2], 1)], + [5, 240, 40, True, "hard_swish", 1], + [5, 240, 40, True, "hard_swish", 1], + [5, 120, 48, True, "hard_swish", 1], + [5, 144, 48, True, "hard_swish", 1], + [5, 288, 96, True, "hard_swish", (small_stride[3], 1)], + [5, 576, 96, True, "hard_swish", 1], + [5, 576, 96, True, "hard_swish", 1], + ] + cls_ch_squeeze = 576 + else: + raise NotImplementedError( + "mode[" + model_name + "_model] is not implemented!" + ) + + supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25] + assert ( + scale in supported_scale + ), "supported scales are {} but input scale is {}".format( + supported_scale, scale + ) + + inplanes = 16 + # conv1 + self.conv1 = ConvBNLayer( + in_channels=in_channels, + out_channels=make_divisible(inplanes * scale), + kernel_size=3, + stride=2, + padding=1, + groups=1, + if_act=True, + act="hard_swish", + name="conv1", + ) + i = 0 + block_list = [] + inplanes = make_divisible(inplanes * scale) + for k, exp, c, se, nl, s in cfg: + block_list.append( + ResidualUnit( + in_channels=inplanes, + mid_channels=make_divisible(scale * exp), + out_channels=make_divisible(scale * c), + kernel_size=k, + stride=s, + use_se=se, + act=nl, + name="conv" + str(i + 2), + ) + ) + inplanes = make_divisible(scale * c) + i += 1 + self.blocks = nn.Sequential(*block_list) + + self.conv2 = ConvBNLayer( + in_channels=inplanes, + out_channels=make_divisible(scale * cls_ch_squeeze), + kernel_size=1, + stride=1, + padding=0, + groups=1, + if_act=True, + act="hard_swish", + name="conv_last", + ) + + self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) + self.out_channels = make_divisible(scale * cls_ch_squeeze) + + def forward(self, x): + x = self.conv1(x) + x = self.blocks(x) + x = self.conv2(x) + x = self.pool(x) + return x diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py 
b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py new file mode 100644 index 0000000000000000000000000000000000000000..447c48f6554c69fec68b77de25e0386cba4aaca8 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py @@ -0,0 +1,234 @@ +import os, sys +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..common import Activation + + +class ConvBNLayer(nn.Module): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='hard_swish'): + super(ConvBNLayer, self).__init__() + self.act = act + self._conv = nn.Conv2d( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias=False) + + self._batch_norm = nn.BatchNorm2d( + num_filters, + ) + if self.act is not None: + self._act = Activation(act_type=act, inplace=True) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act(y) + return y + + +class DepthwiseSeparable(nn.Module): + def __init__(self, + num_channels, + num_filters1, + num_filters2, + num_groups, + stride, + scale, + dw_size=3, + padding=1, + use_se=False): + super(DepthwiseSeparable, self).__init__() + self.use_se = use_se + self._depthwise_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=int(num_filters1 * scale), + filter_size=dw_size, + stride=stride, + padding=padding, + num_groups=int(num_groups * scale)) + if use_se: + self._se = SEModule(int(num_filters1 * scale)) + self._pointwise_conv = ConvBNLayer( + num_channels=int(num_filters1 * scale), + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + + def forward(self, inputs): + y = self._depthwise_conv(inputs) + if self.use_se: + y = self._se(y) + y = self._pointwise_conv(y) + return y + + +class 
MobileNetV1Enhance(nn.Module): + def __init__(self, + in_channels=3, + scale=0.5, + last_conv_stride=1, + last_pool_type='max', + **kwargs): + super().__init__() + self.scale = scale + self.block_list = [] + + self.conv1 = ConvBNLayer( + num_channels=in_channels, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + conv2_1 = DepthwiseSeparable( + num_channels=int(32 * scale), + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale) + self.block_list.append(conv2_1) + + conv2_2 = DepthwiseSeparable( + num_channels=int(64 * scale), + num_filters1=64, + num_filters2=128, + num_groups=64, + stride=1, + scale=scale) + self.block_list.append(conv2_2) + + conv3_1 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale) + self.block_list.append(conv3_1) + + conv3_2 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=(2, 1), + scale=scale) + self.block_list.append(conv3_2) + + conv4_1 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=256, + num_groups=256, + stride=1, + scale=scale) + self.block_list.append(conv4_1) + + conv4_2 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=(2, 1), + scale=scale) + self.block_list.append(conv4_2) + + for _ in range(5): + conv5 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + dw_size=5, + padding=2, + scale=scale, + use_se=False) + self.block_list.append(conv5) + + conv5_6 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=(2, 1), + dw_size=5, + padding=2, + scale=scale, + use_se=True) + self.block_list.append(conv5_6) + + conv6 = DepthwiseSeparable( + 
num_channels=int(1024 * scale), + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=last_conv_stride, + dw_size=5, + padding=2, + use_se=True, + scale=scale) + self.block_list.append(conv6) + + self.block_list = nn.Sequential(*self.block_list) + if last_pool_type == 'avg': + self.pool = nn.AvgPool2d(kernel_size=2, stride=2, padding=0) + else: + self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) + self.out_channels = int(1024 * scale) + + def forward(self, inputs): + y = self.conv1(inputs) + y = self.block_list(y) + y = self.pool(y) + return y + +def hardsigmoid(x): + return F.relu6(x + 3., inplace=True) / 6. + +class SEModule(nn.Module): + def __init__(self, channel, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv1 = nn.Conv2d( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + bias=True) + self.conv2 = nn.Conv2d( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + bias=True) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = hardsigmoid(outputs) + x = torch.mul(inputs, outputs) + + return x diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_pphgnetv2.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_pphgnetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..cf976c53a4aae718e88e6a2f42b743f85cc3f433 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_pphgnetv2.py @@ -0,0 +1,1642 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from .rec_donut_swin import DonutSwinModelOutput +from typing import List, Dict, Union, Callable + + +class 
IdentityBasedConv1x1(nn.Conv2d): + def __init__(self, channels, groups=1): + super(IdentityBasedConv1x1, self).__init__( + in_channels=channels, + out_channels=channels, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias_attr=False, + ) + + assert channels % groups == 0 + input_dim = channels // groups + id_value = np.zeros((channels, input_dim, 1, 1)) + for i in range(channels): + id_value[i, i % input_dim, 0, 0] = 1 + self.id_tensor = torch.Tensor(id_value) + self.weight.set_value(torch.zeros_like(self.weight)) + + def forward(self, input): + kernel = self.weight + self.id_tensor + result = F.conv2d( + input, + kernel, + None, + stride=1, + padding=0, + dilation=self._dilation, + groups=self._groups, + ) + return result + + def get_actual_kernel(self): + return self.weight + self.id_tensor + + +class BNAndPad(nn.Module): + def __init__( + self, + pad_pixels, + num_features, + epsilon=1e-5, + momentum=0.1, + last_conv_bias=None, + bn=nn.BatchNorm2d, + ): + super().__init__() + self.bn = bn(num_features, momentum=momentum, epsilon=epsilon) + self.pad_pixels = pad_pixels + self.last_conv_bias = last_conv_bias + + def forward(self, input): + output = self.bn(input) + if self.pad_pixels > 0: + bias = -self.bn._mean + if self.last_conv_bias is not None: + bias += self.last_conv_bias + pad_values = self.bn.bias + self.bn.weight * ( + bias / torch.sqrt(self.bn._variance + self.bn._epsilon) + ) + """ pad """ + # TODO: n,h,w,c format is not supported yet + n, c, h, w = output.shape + values = pad_values.reshape([1, -1, 1, 1]) + w_values = values.expand([n, -1, self.pad_pixels, w]) + x = torch.cat([w_values, output, w_values], dim=2) + h = h + self.pad_pixels * 2 + h_values = values.expand([n, -1, h, self.pad_pixels]) + x = torch.cat([h_values, x, h_values], dim=3) + output = x + return output + + @property + def weight(self): + return self.bn.weight + + @property + def bias(self): + return self.bn.bias + + @property + def _mean(self): + return self.bn._mean 
+ + @property + def _variance(self): + return self.bn._variance + + @property + def _epsilon(self): + return self.bn._epsilon + + +def conv_bn( + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode="zeros", +): + conv_layer = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=False, + padding_mode=padding_mode, + ) + bn_layer = nn.BatchNorm2D(num_features=out_channels) + se = nn.Sequential() + se.add_sublayer("conv", conv_layer) + se.add_sublayer("bn", bn_layer) + return se + + +def transI_fusebn(kernel, bn): + gamma = bn.weight + std = (bn._variance + bn._epsilon).sqrt() + return ( + kernel * ((gamma / std).reshape([-1, 1, 1, 1])), + bn.bias - bn._mean * gamma / std, + ) + + +def transII_addbranch(kernels, biases): + return sum(kernels), sum(biases) + + +def transIII_1x1_kxk(k1, b1, k2, b2, groups): + if groups == 1: + k = F.conv2d(k2, k1.transpose([1, 0, 2, 3])) + b_hat = (k2 * b1.reshape([1, -1, 1, 1])).sum((1, 2, 3)) + else: + k_slices = [] + b_slices = [] + k1_T = k1.transpose([1, 0, 2, 3]) + k1_group_width = k1.shape[0] // groups + k2_group_width = k2.shape[0] // groups + for g in range(groups): + k1_T_slice = k1_T[:, g * k1_group_width : (g + 1) * k1_group_width, :, :] + k2_slice = k2[g * k2_group_width : (g + 1) * k2_group_width, :, :, :] + k_slices.append(F.conv2d(k2_slice, k1_T_slice)) + b_slices.append( + ( + k2_slice + * b1[g * k1_group_width : (g + 1) * k1_group_width].reshape( + [1, -1, 1, 1] + ) + ).sum((1, 2, 3)) + ) + k, b_hat = transIV_depthconcat(k_slices, b_slices) + return k, b_hat + b2 + + +def transIV_depthconcat(kernels, biases): + return torch.cat(kernels, dim=0), torch.cat(biases) + + +def transV_avg(channels, kernel_size, groups): + input_dim = channels // groups + k = torch.zeros((channels, input_dim, kernel_size, kernel_size)) + k[np.arange(channels), 
np.tile(np.arange(input_dim), groups), :, :] = ( + 1.0 / kernel_size**2 + ) + return k + + +def transVI_multiscale(kernel, target_kernel_size): + H_pixels_to_pad = (target_kernel_size - kernel.shape[2]) // 2 + W_pixels_to_pad = (target_kernel_size - kernel.shape[3]) // 2 + return F.pad( + kernel, [H_pixels_to_pad, H_pixels_to_pad, W_pixels_to_pad, W_pixels_to_pad] + ) + + +class DiverseBranchBlock(nn.Module): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + is_repped=False, + single_init=False, + **kwargs, + ): + super().__init__() + + padding = (filter_size - 1) // 2 + dilation = 1 + + in_channels = num_channels + out_channels = num_filters + kernel_size = filter_size + internal_channels_1x1_3x3 = None + nonlinear = act + + self.is_repped = is_repped + + if nonlinear is None: + self.nonlinear = nn.Identity() + else: + self.nonlinear = nn.ReLU() + + self.kernel_size = kernel_size + self.out_channels = out_channels + self.groups = groups + assert padding == kernel_size // 2 + + if is_repped: + self.dbb_reparam = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=True, + ) + else: + self.dbb_origin = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + ) + + self.dbb_avg = nn.Sequential() + if groups < out_channels: + self.dbb_avg.add_sublayer( + "conv", + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias=False, + ), + ) + self.dbb_avg.add_sublayer( + "bn", BNAndPad(pad_pixels=padding, num_features=out_channels) + ) + self.dbb_avg.add_sublayer( + "avg", + nn.AvgPool2D(kernel_size=kernel_size, stride=stride, padding=0), + ) + self.dbb_1x1 = conv_bn( + in_channels=in_channels, + 
out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=0, + groups=groups, + ) + else: + self.dbb_avg.add_sublayer( + "avg", + nn.AvgPool2D( + kernel_size=kernel_size, stride=stride, padding=padding + ), + ) + + self.dbb_avg.add_sublayer("avgbn", nn.BatchNorm2D(out_channels)) + + if internal_channels_1x1_3x3 is None: + internal_channels_1x1_3x3 = ( + in_channels if groups < out_channels else 2 * in_channels + ) # For mobilenet, it is better to have 2X internal channels + + self.dbb_1x1_kxk = nn.Sequential() + if internal_channels_1x1_3x3 == in_channels: + self.dbb_1x1_kxk.add_sublayer( + "idconv1", IdentityBasedConv1x1(channels=in_channels, groups=groups) + ) + else: + self.dbb_1x1_kxk.add_sublayer( + "conv1", + nn.Conv2d( + in_channels=in_channels, + out_channels=internal_channels_1x1_3x3, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias=False, + ), + ) + self.dbb_1x1_kxk.add_sublayer( + "bn1", + BNAndPad(pad_pixels=padding, num_features=internal_channels_1x1_3x3), + ) + self.dbb_1x1_kxk.add_sublayer( + "conv2", + nn.Conv2d( + in_channels=internal_channels_1x1_3x3, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=0, + groups=groups, + bias=False, + ), + ) + self.dbb_1x1_kxk.add_sublayer("bn2", nn.BatchNorm2D(out_channels)) + + # The experiments reported in the paper used the default initialization of bn.weight (all as 1). But changing the initialization may be useful in some cases. + if single_init: + # Initialize the bn.weight of dbb_origin as 1 and others as 0. This is not the default setting. 
+ self.single_init() + + def forward(self, inputs): + if self.is_repped: + return self.nonlinear(self.dbb_reparam(inputs)) + + out = self.dbb_origin(inputs) + if hasattr(self, "dbb_1x1"): + out += self.dbb_1x1(inputs) + out += self.dbb_avg(inputs) + out += self.dbb_1x1_kxk(inputs) + return self.nonlinear(out) + + def init_gamma(self, gamma_value): + if hasattr(self, "dbb_origin"): + torch.nn.init.constant_(self.dbb_origin.bn.weight, gamma_value) + if hasattr(self, "dbb_1x1"): + torch.nn.init.constant_(self.dbb_1x1.bn.weight, gamma_value) + if hasattr(self, "dbb_avg"): + torch.nn.init.constant_(self.dbb_avg.avgbn.weight, gamma_value) + if hasattr(self, "dbb_1x1_kxk"): + torch.nn.init.constant_(self.dbb_1x1_kxk.bn2.weight, gamma_value) + + def single_init(self): + self.init_gamma(0.0) + if hasattr(self, "dbb_origin"): + torch.nn.init.constant_(self.dbb_origin.bn.weight, 1.0) + + def get_equivalent_kernel_bias(self): + k_origin, b_origin = transI_fusebn( + self.dbb_origin.conv.weight, self.dbb_origin.bn + ) + + if hasattr(self, "dbb_1x1"): + k_1x1, b_1x1 = transI_fusebn(self.dbb_1x1.conv.weight, self.dbb_1x1.bn) + k_1x1 = transVI_multiscale(k_1x1, self.kernel_size) + else: + k_1x1, b_1x1 = 0, 0 + + if hasattr(self.dbb_1x1_kxk, "idconv1"): + k_1x1_kxk_first = self.dbb_1x1_kxk.idconv1.get_actual_kernel() + else: + k_1x1_kxk_first = self.dbb_1x1_kxk.conv1.weight + k_1x1_kxk_first, b_1x1_kxk_first = transI_fusebn( + k_1x1_kxk_first, self.dbb_1x1_kxk.bn1 + ) + k_1x1_kxk_second, b_1x1_kxk_second = transI_fusebn( + self.dbb_1x1_kxk.conv2.weight, self.dbb_1x1_kxk.bn2 + ) + k_1x1_kxk_merged, b_1x1_kxk_merged = transIII_1x1_kxk( + k_1x1_kxk_first, + b_1x1_kxk_first, + k_1x1_kxk_second, + b_1x1_kxk_second, + groups=self.groups, + ) + + k_avg = transV_avg(self.out_channels, self.kernel_size, self.groups) + k_1x1_avg_second, b_1x1_avg_second = transI_fusebn(k_avg, self.dbb_avg.avgbn) + if hasattr(self.dbb_avg, "conv"): + k_1x1_avg_first, b_1x1_avg_first = transI_fusebn( + 
self.dbb_avg.conv.weight, self.dbb_avg.bn + ) + k_1x1_avg_merged, b_1x1_avg_merged = transIII_1x1_kxk( + k_1x1_avg_first, + b_1x1_avg_first, + k_1x1_avg_second, + b_1x1_avg_second, + groups=self.groups, + ) + else: + k_1x1_avg_merged, b_1x1_avg_merged = k_1x1_avg_second, b_1x1_avg_second + + return transII_addbranch( + (k_origin, k_1x1, k_1x1_kxk_merged, k_1x1_avg_merged), + (b_origin, b_1x1, b_1x1_kxk_merged, b_1x1_avg_merged), + ) + + def re_parameterize(self): + if self.is_repped: + return + + kernel, bias = self.get_equivalent_kernel_bias() + self.dbb_reparam = nn.Conv2d( + in_channels=self.dbb_origin.conv._in_channels, + out_channels=self.dbb_origin.conv._out_channels, + kernel_size=self.dbb_origin.conv._kernel_size, + stride=self.dbb_origin.conv._stride, + padding=self.dbb_origin.conv._padding, + dilation=self.dbb_origin.conv._dilation, + groups=self.dbb_origin.conv._groups, + bias=True, + ) + + self.dbb_reparam.weight.set_value(kernel) + self.dbb_reparam.bias.set_value(bias) + + self.__delattr__("dbb_origin") + self.__delattr__("dbb_avg") + if hasattr(self, "dbb_1x1"): + self.__delattr__("dbb_1x1") + self.__delattr__("dbb_1x1_kxk") + self.is_repped = True + + +class Identity(nn.Module): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, inputs): + return inputs + + +class TheseusLayer(nn.Module): + def __init__(self, *args, **kwargs): + super().__init__() + self.res_dict = {} + # self.res_name = self.full_name() + self.res_name = self.__class__.__name__.lower() + self.pruner = None + self.quanter = None + + self.init_net(*args, **kwargs) + + def _return_dict_hook(self, layer, input, output): + res_dict = {"logits": output} + # 'list' is needed to avoid error raised by popping self.res_dict + for res_key in list(self.res_dict): + # clear the res_dict because the forward process may change according to input + res_dict[res_key] = self.res_dict.pop(res_key) + return res_dict + + def init_net( + self, + stages_pattern=None, + 
return_patterns=None, + return_stages=None, + freeze_befor=None, + stop_after=None, + *args, + **kwargs, + ): + # init the output of net + if return_patterns or return_stages: + if return_patterns and return_stages: + msg = f"The 'return_patterns' would be ignored when 'return_stages' is set." + + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min(return_stages) < 0: + msg = f"The 'return_stages' set error. Illegal value(s) have been ignored. The stages' pattern list is {stages_pattern}." + + return_stages = [ + val + for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + if return_patterns: + # call update_res function after the __init__ of the object has completed execution, that is, the constructing of layer or model has been completed. 
+ def update_res_hook(layer, input): + self.update_res(return_patterns) + + self.register_forward_pre_hook(update_res_hook) + + # freeze subnet + if freeze_befor is not None: + self.freeze_befor(freeze_befor) + + # set subnet to Identity + if stop_after is not None: + self.stop_after(stop_after) + + def init_res(self, stages_pattern, return_patterns=None, return_stages=None): + + if return_patterns and return_stages: + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min(return_stages) < 0: + return_stages = [ + val + for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + if return_patterns: + self.update_res(return_patterns) + + def replace_sub(self, *args, **kwargs) -> None: + msg = "The function 'replace_sub()' is deprecated, please use 'upgrade_sublayer()' instead." + raise DeprecationWarning(msg) + + def upgrade_sublayer( + self, + layer_name_pattern: Union[str, List[str]], + handle_func: Callable[[nn.Module, str], nn.Module], + ) -> Dict[str, nn.Module]: + """use 'handle_func' to modify the sub-layer(s) specified by 'layer_name_pattern'. + + Args: + layer_name_pattern (Union[str, List[str]]): The name of layer to be modified by 'handle_func'. + handle_func (Callable[[nn.Module, str], nn.Module]): The function to modify target layer specified by 'layer_name_pattern'. The formal params are the layer(nn.Module) and pattern(str) that is (a member of) layer_name_pattern (when layer_name_pattern is List type). And the return is the layer processed. + + Returns: + Dict[str, nn.Module]: The key is the pattern and corresponding value is the result returned by 'handle_func()'. 
+ + Examples: + + from paddle import nn + import paddleclas + + def rep_func(layer: nn.Module, pattern: str): + new_layer = nn.Conv2d( + in_channels=layer._in_channels, + out_channels=layer._out_channels, + kernel_size=5, + padding=2 + ) + return new_layer + + net = paddleclas.MobileNetV1() + res = net.upgrade_sublayer(layer_name_pattern=["blocks[11].depthwise_conv.conv", "blocks[12].depthwise_conv.conv"], handle_func=rep_func) + print(res) + # {'blocks[11].depthwise_conv.conv': the corresponding new_layer, 'blocks[12].depthwise_conv.conv': the corresponding new_layer} + """ + + if not isinstance(layer_name_pattern, list): + layer_name_pattern = [layer_name_pattern] + + hit_layer_pattern_list = [] + for pattern in layer_name_pattern: + # parse pattern to find target layer and its parent + layer_list = parse_pattern_str(pattern=pattern, parent_layer=self) + if not layer_list: + continue + + sub_layer_parent = layer_list[-2]["layer"] if len(layer_list) > 1 else self + sub_layer = layer_list[-1]["layer"] + sub_layer_name = layer_list[-1]["name"] + sub_layer_index_list = layer_list[-1]["index_list"] + + new_sub_layer = handle_func(sub_layer, pattern) + + if sub_layer_index_list: + if len(sub_layer_index_list) > 1: + sub_layer_parent = getattr(sub_layer_parent, sub_layer_name)[ + sub_layer_index_list[0] + ] + for sub_layer_index in sub_layer_index_list[1:-1]: + sub_layer_parent = sub_layer_parent[sub_layer_index] + sub_layer_parent[sub_layer_index_list[-1]] = new_sub_layer + else: + getattr(sub_layer_parent, sub_layer_name)[ + sub_layer_index_list[0] + ] = new_sub_layer + else: + setattr(sub_layer_parent, sub_layer_name, new_sub_layer) + + hit_layer_pattern_list.append(pattern) + return hit_layer_pattern_list + + def stop_after(self, stop_layer_name: str) -> bool: + """stop forward and backward after 'stop_layer_name'. + + Args: + stop_layer_name (str): The name of layer that stop forward and backward after this layer. 
def freeze_befor(self, layer_name: str) -> bool:
    """Stop gradients from flowing back through the layer named ``layer_name``.

    The target layer is wrapped so that its output is detached from the
    autograd graph; the wrapped layer and everything that feeds it therefore
    receive no gradient from later layers.

    Args:
        layer_name (str): Pattern of the layer to wrap, resolved by
            ``self.upgrade_sublayer``.

    Returns:
        bool: 'True' if the layer was found and wrapped, 'False' otherwise.
    """

    def stop_grad(layer, pattern):
        class StopGradLayer(nn.Module):
            def __init__(self):
                super().__init__()
                self.layer = layer

            def forward(self, x):
                # Paddle's ``x.stop_gradient = True`` has no effect on a torch
                # tensor; ``detach()`` is what actually cuts the graph.
                return self.layer(x).detach()

        return StopGradLayer()

    res = self.upgrade_sublayer(layer_name, stop_grad)
    return len(res) > 0
def save_sub_res_hook(layer, input, output):
    """Forward hook: record ``output`` in ``layer.res_dict`` under ``layer.res_name``."""
    results = layer.res_dict
    key = layer.res_name
    results[key] = output
def parse_pattern_str(
    pattern: str, parent_layer: nn.Module
) -> Union[None, List[Dict[str, Union[nn.Module, str, None]]]]:
    """Resolve a dotted layer pattern such as ``"blocks[1].conv"`` step by step.

    Args:
        pattern (str): Dot-separated attribute names, each optionally followed
            by one or more ``[index]`` suffixes.
        parent_layer (nn.Module): The root layer the pattern is relative to.

    Returns:
        Union[None, List[Dict[str, Union[nn.Module, str, None]]]]: None if any
        segment cannot be resolved (missing attribute, non-numeric index or
        index out of range). Otherwise one dict per segment, in order:
        [
            {"layer": resolved layer, "name": attribute name, "index_list": list of index strings or None},
            ...
        ]
    """

    layer_list = []
    for segment in pattern.split("."):
        target_layer_name, *raw_indices = segment.split("[")
        # Indices stay strings in the result (existing contract); they are
        # converted to int only for the actual container lookup below.
        target_layer_index_list = [raw.split("]")[0] for raw in raw_indices] or None

        target_layer = getattr(parent_layer, target_layer_name, None)
        if target_layer is None:
            return None

        if target_layer_index_list:
            for target_layer_index in target_layer_index_list:
                try:
                    position = int(target_layer_index)
                except ValueError:
                    return None
                if position < 0 or position >= len(target_layer):
                    return None
                # torch ModuleList / Sequential reject string positions.
                target_layer = target_layer[position]

        layer_list.append(
            {
                "layer": target_layer,
                "name": target_layer_name,
                "index_list": target_layer_index_list,
            }
        )
        parent_layer = target_layer

    return layer_list
+ """ + + def __init__(self, scale_value=1.0, bias_value=0.0, lr_mult=1.0, lab_lr=0.01): + super().__init__() + # self.scale = self.create_parameter( + # shape=[ + # 1, + # ], + # default_initializer=nn.init.Constant(value=scale_value), + # # attr=ParamAttr(learning_rate=lr_mult * lab_lr), + # ) + # self.add_parameter("scale", self.scale) + self.scale = torch.Parameter( + nn.init.constant_( + torch.ones(1).to(torch.float32), val=scale_value + ) + ) + self.register_parameter("scale", self.scale) + + # self.bias = self.create_parameter( + # shape=[ + # 1, + # ], + # default_initializer=nn.init.Constant(value=bias_value), + # # attr=ParamAttr(learning_rate=lr_mult * lab_lr), + # ) + # self.add_parameter("bias", self.bias) + self.bias = torch.Parameter( + nn.init.constant_( + torch.ones(1).to(torch.float32), val=bias_value + ) + ) + self.register_parameter("bias", self.bias) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNAct(TheseusLayer): + """ + ConvBNAct is a combination of convolution and batchnorm layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + stride (int): Stride of the convolution. Defaults to 1. + padding (int/str): Padding or padding type for the convolution. Defaults to 1. + groups (int): Number of groups for the convolution. Defaults to 1. + use_act: (bool): Whether to use activation function. Defaults to True. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + groups=1, + use_act=True, + use_lab=False, + lr_mult=1.0, + ): + super().__init__() + self.use_act = use_act + self.use_lab = use_lab + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride, + padding=padding if isinstance(padding, str) else (kernel_size - 1) // 2, + groups=groups, + bias=False, + ) + self.bn = nn.BatchNorm2d( + out_channels, + ) + if self.use_act: + self.act = nn.ReLU() + if self.use_lab: + self.lab = LearnableAffineBlock(lr_mult=lr_mult) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + if self.use_lab: + x = self.lab(x) + return x + + +class LightConvBNAct(TheseusLayer): + """ + LightConvBNAct is a combination of pw and dw layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the depth-wise convolution kernel. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. 
class PaddingSameAsPaddleMaxPool2d(torch.nn.Module):
    """Max pooling preceded by explicit zero padding computed from the input size.

    For each spatial axis the input is padded so that it spans
    ``(ceil(size / stride) - 1) * stride + kernel_size`` elements, with the
    extra element (if the total is odd) placed at the end. The name suggests
    this mirrors Paddle's "SAME" padding; that equivalence is not verified here.

    Args:
        kernel_size (int): Pooling window size.
        stride (int): Pooling stride. Defaults to 1.
    """

    def __init__(self, kernel_size, stride=1):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.pool = torch.nn.MaxPool2d(kernel_size, stride, padding=0, ceil_mode=True)

    def _split_padding(self, size):
        """Return the (before, after) padding for one spatial axis of length ``size``."""
        span = (math.ceil(size / self.stride) - 1) * self.stride + self.kernel_size
        total = max(0, span - size)
        before = total // 2
        return before, total - before

    def forward(self, x):
        _, _, height, width = x.shape
        top, bottom = self._split_padding(height)
        left, right = self._split_padding(width)
        # F.pad takes the last dimension first: (left, right, top, bottom).
        padded = torch.nn.functional.pad(x, [left, right, top, bottom])
        return self.pool(padded)
class HGV2_Block(TheseusLayer):
    """
    HGV2_Block, the basic unit that constitutes the HGV2_Stage.

    Args:
        in_channels (int): Number of input channels.
        mid_channels (int): Number of channels produced by each inner layer.
        out_channels (int): Number of output channels.
        kernel_size (int): Size of the convolution kernel. Defaults to 3.
        layer_num (int): Number of layers in the HGV2 block. Defaults to 6.
        identity (bool): Whether to add the block input to its output.
            Defaults to False.
        light_block (bool): Use LightConvBNAct layers when True, ConvBNAct
            layers otherwise. Defaults to True.
        use_lab (bool): Whether to use the LAB operation. Defaults to False.
        lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0.
    """

    def __init__(
        self,
        in_channels,
        mid_channels,
        out_channels,
        kernel_size=3,
        layer_num=6,
        identity=False,
        light_block=True,
        use_lab=False,
        lr_mult=1.0,
    ):
        super().__init__()
        self.identity = identity

        # Pick the layer class directly instead of eval()-ing its name.
        block_cls = LightConvBNAct if light_block else ConvBNAct
        self.layers = nn.ModuleList(
            [
                block_cls(
                    in_channels=in_channels if i == 0 else mid_channels,
                    out_channels=mid_channels,
                    stride=1,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult,
                )
                for i in range(layer_num)
            ]
        )
        # feature aggregation: the input plus every inner layer's output are concatenated
        total_channels = in_channels + layer_num * mid_channels
        self.aggregation_squeeze_conv = ConvBNAct(
            in_channels=total_channels,
            out_channels=out_channels // 2,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )
        self.aggregation_excitation_conv = ConvBNAct(
            in_channels=out_channels // 2,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult,
        )

    def forward(self, x):
        identity = x
        output = [x]
        for layer in self.layers:
            x = layer(x)
            output.append(x)
        x = torch.cat(output, dim=1)
        x = self.aggregation_squeeze_conv(x)
        x = self.aggregation_excitation_conv(x)
        if self.identity:
            # Out-of-place add: mutating an activation output in place can
            # invalidate the tensor autograd saved for the backward pass.
            x = x + identity
        return x
Defaults to 3. + use_lab (bool, optional): Whether to use the LAB operation. Defaults to False. + lr_mult (float, optional): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__( + self, + in_channels, + mid_channels, + out_channels, + block_num, + layer_num=6, + is_downsample=True, + light_block=True, + kernel_size=3, + use_lab=False, + stride=2, + lr_mult=1.0, + ): + + super().__init__() + self.is_downsample = is_downsample + if self.is_downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=stride, + groups=in_channels, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult, + ) + + blocks_list = [] + for i in range(block_num): + blocks_list.append( + HGV2_Block( + in_channels=in_channels if i == 0 else out_channels, + mid_channels=mid_channels, + out_channels=out_channels, + kernel_size=kernel_size, + layer_num=layer_num, + identity=False if i == 0 else True, + light_block=light_block, + use_lab=use_lab, + lr_mult=lr_mult, + ) + ) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + if self.is_downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +class PPHGNetV2(TheseusLayer): + """ + PPHGNetV2 + + Args: + stage_config (dict): Config for PPHGNetV2 stages. such as the number of channels, stride, etc. + stem_channels: (list): Number of channels of the stem of the PPHGNetV2. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + use_last_conv (bool): Whether to use the last conv layer as the output channel. Defaults to True. + class_expand (int): Number of channels for the last 1x1 convolutional layer. + drop_prob (float): Dropout probability for the last 1x1 convolutional layer. Defaults to 0.0. + class_num (int): The number of classes for the classification layer. Defaults to 1000. + lr_mult_list (list): Learning rate multiplier for the stages. Defaults to [1.0, 1.0, 1.0, 1.0, 1.0]. + Returns: + model: nn.Module. 
Specific PPHGNetV2 model depends on args. + """ + + def __init__( + self, + stage_config, + stem_channels=[3, 32, 64], + use_lab=False, + use_last_conv=True, + class_expand=2048, + dropout_prob=0.0, + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + det=False, + text_rec=False, + out_indices=None, + **kwargs, + ): + super().__init__() + self.det = det + self.text_rec = text_rec + self.use_lab = use_lab + self.use_last_conv = use_last_conv + self.class_expand = class_expand + self.class_num = class_num + self.out_indices = out_indices if out_indices is not None else [0, 1, 2, 3] + self.out_channels = [] + + # stem + self.stem = StemBlock( + in_channels=stem_channels[0], + mid_channels=stem_channels[1], + out_channels=stem_channels[2], + use_lab=use_lab, + lr_mult=lr_mult_list[0], + text_rec=text_rec, + ) + + # stages + self.stages = nn.ModuleList() + for i, k in enumerate(stage_config): + ( + in_channels, + mid_channels, + out_channels, + block_num, + is_downsample, + light_block, + kernel_size, + layer_num, + stride, + ) = stage_config[k] + self.stages.append( + HGV2_Stage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + is_downsample, + light_block, + kernel_size, + use_lab, + stride, + lr_mult=lr_mult_list[i + 1], + ) + ) + if i in self.out_indices: + self.out_channels.append(out_channels) + if not self.det: + self.out_channels = stage_config["stage4"][2] + + self.avg_pool = nn.AdaptiveAvgPool2d(1) + + if self.use_last_conv: + self.last_conv = nn.Conv2d( + in_channels=out_channels, + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ) + self.act = nn.ReLU() + if self.use_lab: + self.lab = LearnableAffineBlock() + # self.dropout = nn.Dropout(p=dropout_prob, mode="downscale_in_infer") + self.dropout = nn.Dropout(p=dropout_prob) + + self.flatten = nn.Flatten(start_dim=1, end_dim=-1) + if not self.det: + self.fc = nn.Linear( + self.class_expand if self.use_last_conv else out_channels, + 
def PPHGNetV2_B0(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B0
    Args:
        pretrained (bool/str): Accepted for API compatibility; not used by this function.
        use_ssld (bool): Accepted for API compatibility; not used by this function.
        **kwargs: Forwarded to `PPHGNetV2`.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B0` model depends on args.
    """
    # PPHGNetV2 unpacks nine values per stage, so the stride (2, matching the
    # HGV2_Stage default) must be listed explicitly.
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
        "stage1": [16, 16, 64, 1, False, False, 3, 3, 2],
        "stage2": [64, 32, 256, 1, True, False, 3, 3, 2],
        "stage3": [256, 64, 512, 2, True, True, 5, 3, 2],
        "stage4": [512, 128, 1024, 1, True, True, 5, 3, 2],
    }

    model = PPHGNetV2(
        stem_channels=[3, 16, 16], stage_config=stage_config, use_lab=True, **kwargs
    )
    return model
def PPHGNetV2_B2(pretrained=False, use_ssld=False, **kwargs):
    """
    PPHGNetV2_B2
    Args:
        pretrained (bool/str): Accepted for API compatibility; not used by this function.
        use_ssld (bool): Accepted for API compatibility; not used by this function.
        **kwargs: Forwarded to `PPHGNetV2`.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B2` model depends on args.
    """
    # PPHGNetV2 unpacks nine values per stage, so the stride (2, matching the
    # HGV2_Stage default) must be listed explicitly.
    stage_config = {
        # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
        "stage1": [32, 32, 96, 1, False, False, 3, 4, 2],
        "stage2": [96, 64, 384, 1, True, False, 3, 4, 2],
        "stage3": [384, 128, 768, 3, True, True, 5, 4, 2],
        "stage4": [768, 256, 1536, 1, True, True, 5, 4, 2],
    }

    model = PPHGNetV2(
        stem_channels=[3, 24, 32], stage_config=stage_config, use_lab=True, **kwargs
    )
    return model
def PPHGNetV2_B4(pretrained=False, use_ssld=False, det=False, text_rec=False, **kwargs):
    """
    PPHGNetV2_B4
    Args:
        pretrained (bool/str): Accepted for API compatibility; not used by this function.
        use_ssld (bool): Accepted for API compatibility; not used by this function.
        det (bool): Select the detection stage config and forward `det` to `PPHGNetV2`.
        text_rec (bool): Forwarded to `PPHGNetV2`.
        **kwargs: Forwarded to `PPHGNetV2`.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B4` model depends on args.
    """
    # Columns of every stage entry:
    # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
    if det:
        chosen_config = {
            "stage1": [48, 48, 128, 1, False, False, 3, 6, 2],
            "stage2": [128, 96, 512, 1, True, False, 3, 6, 2],
            "stage3": [512, 192, 1024, 3, True, True, 5, 6, 2],
            "stage4": [1024, 384, 2048, 1, True, True, 5, 6, 2],
        }
    else:
        # Recognition variant: per-axis strides and a downsampling first stage.
        chosen_config = {
            "stage1": [48, 48, 128, 1, True, False, 3, 6, [2, 1]],
            "stage2": [128, 96, 512, 1, True, False, 3, 6, [1, 2]],
            "stage3": [512, 192, 1024, 3, True, True, 5, 6, [2, 1]],
            "stage4": [1024, 384, 2048, 1, True, True, 5, 6, [2, 1]],
        }

    return PPHGNetV2(
        stem_channels=[3, 32, 48],
        stage_config=chosen_config,
        use_lab=False,
        det=det,
        text_rec=text_rec,
        **kwargs,
    )
class PPHGNetV2_B4_Formula(nn.Module):
    """
    PPHGNetV2_B4_Formula

    Wraps a `PPHGNetV2` B4 backbone and returns its final feature map
    flattened to a (batch, h * w, channels) sequence inside a
    `DonutSwinModelOutput`.

    Args:
        in_channels (int): Stored on the module. The stem itself is always
            built for 3 input channels; single-channel inputs are repeated to
            3 channels in `forward`. Default is 3.
        class_num (int): Forwarded to `PPHGNetV2`. Default is 1000.
    Returns:
        model: nn.Module. Specific `PPHGNetV2_B4` model with defined architecture.
    """

    def __init__(self, in_channels=3, class_num=1000):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = 2048
        stage_config = {
            # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num, stride
            "stage1": [48, 48, 128, 1, False, False, 3, 6, 2],
            "stage2": [128, 96, 512, 1, True, False, 3, 6, 2],
            "stage3": [512, 192, 1024, 3, True, True, 5, 6, 2],
            "stage4": [1024, 384, 2048, 1, True, True, 5, 6, 2],
        }

        self.pphgnet_b4 = PPHGNetV2(
            stem_channels=[3, 32, 48],
            stage_config=stage_config,
            class_num=class_num,
            use_lab=False,
        )

    def forward(self, input_data):
        """Run the backbone.

        In training mode `input_data` is unpacked as
        (pixel_values, label, attention_mask) and the label and mask are
        returned unchanged next to the model output. In eval mode
        `input_data` is either the image tensor or a list whose first item is it.
        """
        if self.training:
            pixel_values, label, attention_mask = input_data
        elif isinstance(input_data, list):
            pixel_values = input_data[0]
        else:
            pixel_values = input_data

        num_channels = pixel_values.shape[1]
        if num_channels == 1:
            pixel_values = torch.repeat_interleave(pixel_values, repeats=3, dim=1)

        pphgnet_b4_output = self.pphgnet_b4(pixel_values)
        b, c, h, w = pphgnet_b4_output.shape
        # torch's Tensor.transpose swaps exactly two dims; reordering with a
        # full axis list (the Paddle signature) needs permute.
        pphgnet_b4_output = pphgnet_b4_output.reshape([b, c, h * w]).permute(0, 2, 1)
        pphgnet_b4_output = DonutSwinModelOutput(
            last_hidden_state=pphgnet_b4_output,
            pooler_output=None,
            hidden_states=None,
            attentions=False,
            reshaped_hidden_states=None,
        )
        if self.training:
            return pphgnet_b4_output, label, attention_mask
        return pphgnet_b4_output
def drop_path(x, drop_prob=0.0, training=False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Each sample in the batch (first dimension of ``x``) is either zeroed or
    scaled by ``1 / (1 - drop_prob)``.

    Args:
        x (torch.Tensor): Input whose first dimension indexes samples.
        drop_prob (float | None): Probability of dropping a sample. ``None``
            and ``0.0`` disable dropping.
        training (bool): Dropping is only applied when True.

    Returns:
        torch.Tensor: ``x`` itself when dropping is disabled, otherwise a new
        tensor with the same shape, dtype and device.
    """
    # ``not drop_prob`` also covers None, which is DropPath's default.
    if not drop_prob or not training:
        return x
    keep_prob = 1 - drop_prob
    if keep_prob <= 0:
        # Everything is dropped; dividing by zero would yield NaN instead of 0.
        return torch.zeros_like(x)
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    # The mask must live on x's device, otherwise GPU inputs fail to multiply.
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor = torch.floor(random_tensor)  # binarize
    return x.divide(keep_prob) * random_tensor
class ConvMixer(nn.Module):
    """Token mixer that applies a grouped 2-D convolution over the token grid.

    Args:
        dim (int): Channel dimension of the tokens.
        num_heads (int): Used as the number of convolution groups. Defaults to 8.
        HW (list): Height and width of the token grid; the sequence length of
            the input must equal ``HW[0] * HW[1]``. Defaults to [8, 25].
        local_k (list): Convolution kernel size as [height, width]. Defaults to [3, 3].
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        HW=[8, 25],
        local_k=[3, 3],
    ):
        super().__init__()
        self.HW = HW
        self.dim = dim
        self.local_mixer = nn.Conv2d(
            dim,
            dim,
            local_k,
            1,
            [local_k[0] // 2, local_k[1] // 2],
            groups=num_heads,
        )

    def forward(self, x):
        """Map (batch, h * w, dim) tokens to (batch, h * w, dim) tokens."""
        h = self.HW[0]
        w = self.HW[1]
        # (batch, tokens, dim) -> (batch, dim, h, w). torch needs permute for
        # an axis list, and -1 (not Paddle's 0) to keep the batch size.
        x = x.permute(0, 2, 1).reshape(-1, self.dim, h, w)
        x = self.local_mixer(x)
        x = x.flatten(2).permute(0, 2, 1)
        return x
if self.HW is not None: + N = self.N + C = self.C + else: + _, N, C = x.shape + qkv = self.qkv(x) + qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads)).permute( + 2, 0, 3, 1, 4 + ) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = q.matmul(k.permute(0, 1, 3, 2)) + if self.mixer == "Local": + attn += self.mask + attn = nn.functional.softmax(attn, dim=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).permute(0, 2, 1, 3).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + def __init__( + self, + dim, + num_heads, + mixer="Global", + local_mixer=[7, 11], + HW=None, + mlp_ratio=4.0, + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer="gelu", + norm_layer="nn.LayerNorm", + epsilon=1e-6, + prenorm=True, + ): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, eps=epsilon) + else: + self.norm1 = norm_layer(dim) + if mixer == "Global" or mixer == "Local": + self.mixer = Attention( + dim, + num_heads=num_heads, + mixer=mixer, + HW=HW, + local_k=local_mixer, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + elif mixer == "Conv": + self.mixer = ConvMixer(dim, num_heads=num_heads, HW=HW, local_k=local_mixer) + else: + raise TypeError("The mixer must be one of [Global, Local, Conv]") + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, eps=epsilon) + else: + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp_ratio = mlp_ratio + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + self.prenorm = prenorm + + def forward(self, x): + if self.prenorm: + x = self.norm1(x + self.drop_path(self.mixer(x))) + x = self.norm2(x + self.drop_path(self.mlp(x))) + else: + x = x + 
class PatchEmbed(nn.Module):
    """Image to Patch Embedding.

    Args:
        img_size: ``[height, width]`` the module accepts; checked in ``forward``.
        in_channels: Input channels of the first conv in ``"pope"`` mode.
        embed_dim: Channel count of the produced tokens.
        sub_num: Number of stride-2 conv stages in ``"pope"`` mode (2 or 3).
        patch_size: ``[ph, pw]`` kernel and stride of the ``"linear"`` conv.
        mode: ``"pope"`` (stacked stride-2 ``ConvBNLayer``) or ``"linear"``
            (a single patchifying ``nn.Conv2d``).

    Attributes:
        num_patches: Number of tokens produced per image.
    """

    def __init__(
        self,
        img_size=[32, 100],
        in_channels=3,
        embed_dim=768,
        sub_num=2,
        patch_size=[4, 4],
        mode="pope",
    ):
        super().__init__()
        num_patches = (img_size[1] // (2**sub_num)) * (img_size[0] // (2**sub_num))
        self.img_size = img_size
        self.num_patches = num_patches
        self.embed_dim = embed_dim
        self.norm = None
        if mode == "pope":
            # Channel widths of the stride-2 stem, input first.
            if sub_num == 2:
                widths = [in_channels, embed_dim // 2, embed_dim]
            elif sub_num == 3:
                widths = [in_channels, embed_dim // 4, embed_dim // 2, embed_dim]
            else:
                # As before, no projection is created for other values.
                widths = None
            if widths is not None:
                self.proj = nn.Sequential(
                    *[
                        ConvBNLayer(
                            in_channels=c_in,
                            out_channels=c_out,
                            kernel_size=3,
                            stride=2,
                            padding=1,
                            act="gelu",
                            bias_attr=True,
                        )
                        for c_in, c_out in zip(widths[:-1], widths[1:])
                    ]
                )
        elif mode == "linear":
            # NOTE(review): the input channel count is hard-coded to 1 and
            # ignores ``in_channels`` - confirm this is intended.
            self.proj = nn.Conv2d(
                1, embed_dim, kernel_size=patch_size, stride=patch_size
            )
            # Parenthesised on purpose: ``a // p0 * b // p1`` evaluates as
            # ``((a // p0) * b) // p1`` and over-counts when the width is not
            # a multiple of the patch width.
            self.num_patches = (img_size[0] // patch_size[0]) * (
                img_size[1] // patch_size[1]
            )

    def forward(self, x):
        """Project an image batch to a token sequence.

        Raises:
            AssertionError: If the spatial size of ``x`` differs from
                ``img_size``.
        """
        B, C, H, W = x.shape
        assert (
            H == self.img_size[0] and W == self.img_size[1]
        ), "Input image size ({}*{}) doesn't match model ({}*{}).".format(
            H, W, self.img_size[0], self.img_size[1]
        )
        # (B, embed_dim, h, w) -> (B, h * w, embed_dim)
        x = self.proj(x).flatten(2).permute(0, 2, 1)
        return x
1], + sub_norm="nn.LayerNorm", + act=None, + ): + super().__init__() + self.types = types + if types == "Pool": + self.avgpool = nn.AvgPool2d( + kernel_size=[3, 5], stride=stride, padding=[1, 2] + ) + self.maxpool = nn.MaxPool2d( + kernel_size=[3, 5], stride=stride, padding=[1, 2] + ) + self.proj = nn.Linear(in_channels, out_channels) + else: + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + ) + self.norm = eval(sub_norm)(out_channels) + if act is not None: + self.act = act() + else: + self.act = None + + def forward(self, x): + if self.types == "Pool": + x1 = self.avgpool(x) + x2 = self.maxpool(x) + x = (x1 + x2) * 0.5 + out = self.proj(x.flatten(2).permute(0, 2, 1)) + else: + x = self.conv(x) + out = x.flatten(2).permute(0, 2, 1) + out = self.norm(out) + if self.act is not None: + out = self.act(out) + + return out + + +class SVTRNet(nn.Module): + def __init__( + self, + img_size=[32, 100], + in_channels=3, + embed_dim=[64, 128, 256], + depth=[3, 6, 3], + num_heads=[2, 4, 8], + mixer=["Local"] * 6 + ["Global"] * 6, # Local atten, Global atten, Conv + local_mixer=[[7, 11], [7, 11], [7, 11]], + patch_merging="Conv", # Conv, Pool, None + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + last_drop=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.1, + norm_layer="nn.LayerNorm", + sub_norm="nn.LayerNorm", + epsilon=1e-6, + out_channels=192, + out_char_num=25, + block_unit="Block", + act="gelu", + last_stage=True, + sub_num=2, + prenorm=True, + use_lenhead=False, + **kwargs + ): + super().__init__() + self.img_size = img_size + self.embed_dim = embed_dim + self.out_channels = out_channels + self.prenorm = prenorm + patch_merging = ( + None + if patch_merging != "Conv" and patch_merging != "Pool" + else patch_merging + ) + self.patch_embed = PatchEmbed( + img_size=img_size, + in_channels=in_channels, + embed_dim=embed_dim[0], + sub_num=sub_num, + ) + num_patches = self.patch_embed.num_patches + self.HW = 
[img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)] + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0])) + self.pos_drop = nn.Dropout(p=drop_rate) + Block_unit = eval(block_unit) + + dpr = np.linspace(0, drop_path_rate, sum(depth)) + self.blocks1 = nn.ModuleList( + [ + Block_unit( + dim=embed_dim[0], + num_heads=num_heads[0], + mixer=mixer[0 : depth[0]][i], + HW=self.HW, + local_mixer=local_mixer[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=act, + attn_drop=attn_drop_rate, + drop_path=dpr[0 : depth[0]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm, + ) + for i in range(depth[0]) + ] + ) + if patch_merging is not None: + self.sub_sample1 = SubSample( + embed_dim[0], + embed_dim[1], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging, + ) + HW = [self.HW[0] // 2, self.HW[1]] + else: + HW = self.HW + self.patch_merging = patch_merging + self.blocks2 = nn.ModuleList( + [ + Block_unit( + dim=embed_dim[1], + num_heads=num_heads[1], + mixer=mixer[depth[0] : depth[0] + depth[1]][i], + HW=HW, + local_mixer=local_mixer[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=act, + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0] : depth[0] + depth[1]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm, + ) + for i in range(depth[1]) + ] + ) + if patch_merging is not None: + self.sub_sample2 = SubSample( + embed_dim[1], + embed_dim[2], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging, + ) + HW = [self.HW[0] // 4, self.HW[1]] + else: + HW = self.HW + self.blocks3 = nn.ModuleList( + [ + Block_unit( + dim=embed_dim[2], + num_heads=num_heads[2], + mixer=mixer[depth[0] + depth[1] :][i], + HW=HW, + local_mixer=local_mixer[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=act, + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0] + depth[1] :][i], + 
norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm, + ) + for i in range(depth[2]) + ] + ) + self.last_stage = last_stage + if last_stage: + self.avg_pool = nn.AdaptiveAvgPool2d([1, out_char_num]) + self.last_conv = nn.Conv2d( + in_channels=embed_dim[2], + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False, + ) + self.hardswish = Activation("hard_swish", inplace=True) # nn.Hardswish() + # self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer") + self.dropout = nn.Dropout(p=last_drop) + if not prenorm: + self.norm = eval(norm_layer)(embed_dim[-1], eps=epsilon) + self.use_lenhead = use_lenhead + if use_lenhead: + self.len_conv = nn.Linear(embed_dim[2], self.out_channels) + self.hardswish_len = Activation( + "hard_swish", inplace=True + ) # nn.Hardswish() + self.dropout_len = nn.Dropout(p=last_drop) + + torch.nn.init.xavier_normal_(self.pos_embed) + self.apply(self._init_weights) + + def _init_weights(self, m): + # weight initialization + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out") + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.kaiming_normal_(m.weight, mode="fan_out") + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + def forward_features(self, x): + x = self.patch_embed(x) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample1( + x.permute(0, 2, 1).reshape( + [-1, self.embed_dim[0], self.HW[0], self.HW[1]] + ) + ) + for blk in self.blocks2: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample2( + x.permute(0, 2, 
class Hswish(nn.Module):
    """Hard-swish activation: ``x * relu6(x + 3) / 6``."""

    def __init__(self, inplace=True):
        super().__init__()
        self.inplace = inplace

    def forward(self, x):
        # ``inplace`` is applied to the temporary ``x + 3.0`` only, so the
        # caller's tensor is never overwritten.
        gate = F.relu6(x + 3.0, inplace=self.inplace)
        return x * gate / 6.0
class Activation(nn.Module):
    """Wrap an activation layer selected by name.

    Args:
        act_type: Case-insensitive name. Supported: ``relu``, ``relu6``,
            ``hard_sigmoid``, ``hard_swish`` / ``hswish``, ``leakyrelu``,
            ``gelu`` and ``swish``.
        inplace: Forwarded to the selected layer.

    Raises:
        NotImplementedError: For ``sigmoid`` and for any unknown name.
    """

    def __init__(self, act_type, inplace=True):
        super(Activation, self).__init__()
        key = act_type.lower()
        # Factories are lazy so that only the requested layer is built.
        factories = {
            "relu": lambda: nn.ReLU(inplace=inplace),
            "relu6": lambda: nn.ReLU6(inplace=inplace),
            "hard_sigmoid": lambda: Hsigmoid(inplace),
            "hard_swish": lambda: Hswish(inplace=inplace),
            "hswish": lambda: Hswish(inplace=inplace),
            "leakyrelu": lambda: nn.LeakyReLU(inplace=inplace),
            "gelu": lambda: GELU(inplace=inplace),
            "swish": lambda: Swish(inplace=inplace),
        }
        # "sigmoid" is deliberately absent: it is rejected like unknown names.
        if key not in factories:
            raise NotImplementedError
        self.act = factories[key]()

    def forward(self, inputs):
        return self.act(inputs)
def build_head(config, **kwargs):
    """Instantiate the head named by ``config["name"]``.

    ``config`` is modified in place: ``"name"`` and, if present,
    ``"char_num"`` are removed before the remaining entries are passed to the
    head constructor together with ``kwargs``.

    Raises:
        KeyError: If ``config`` has no ``"name"`` entry.
        AssertionError: If the named head is not supported.
    """
    # det head
    from .det_db_head import DBHead, PFHeadLocal

    # rec head
    from .rec_ctc_head import CTCHead
    from .rec_multi_head import MultiHead

    # cls head
    from .cls_head import ClsHead

    # Explicit name -> class table instead of ``eval``: the previous guard was
    # an ``assert``, which ``python -O`` strips, leaving ``eval`` on a
    # config-supplied string.
    head_classes = {
        "DBHead": DBHead,
        "CTCHead": CTCHead,
        "ClsHead": ClsHead,
        "MultiHead": MultiHead,
        "PFHeadLocal": PFHeadLocal,
    }

    module_name = config.pop("name")
    # The value is unused; popping keeps "char_num" out of the constructor call.
    config.pop("char_num", 6625)
    if module_name not in head_classes:
        raise AssertionError("head only support {}".format(list(head_classes)))
    return head_classes[module_name](**config, **kwargs)
class Head(nn.Module):
    """Conv -> BN -> ReLU followed by two stride-2 transposed convolutions.

    The result is a single-channel map squashed by a sigmoid; each transposed
    convolution uses ``kernel_size=2, stride=2``.

    Args:
        in_channels: Channels of the incoming feature map; the intermediate
            width is ``in_channels // 4``.
    """

    def __init__(self, in_channels, **kwargs):
        super().__init__()
        mid_channels = in_channels // 4

        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=3,
            padding=1,
            bias=False,
        )
        self.conv_bn1 = nn.BatchNorm2d(mid_channels)
        self.relu1 = Activation(act_type='relu')

        self.conv2 = nn.ConvTranspose2d(
            in_channels=mid_channels,
            out_channels=mid_channels,
            kernel_size=2,
            stride=2,
        )
        self.conv_bn2 = nn.BatchNorm2d(mid_channels)
        self.relu2 = Activation(act_type='relu')

        self.conv3 = nn.ConvTranspose2d(
            in_channels=mid_channels,
            out_channels=1,
            kernel_size=2,
            stride=2,
        )

    def forward(self, x, return_f=False):
        """Return the sigmoid map, plus the pre-``conv3`` features when
        ``return_f`` is exactly ``True``."""
        feat = self.relu1(self.conv_bn1(self.conv1(x)))
        feat = self.relu2(self.conv_bn2(self.conv2(feat)))
        prob = torch.sigmoid(self.conv3(feat))
        if return_f is True:
            return prob, feat
        return prob
class LocalModule(nn.Module):
    """Refine an initial map using local features.

    Args:
        in_c: Channels of the feature tensor ``x``; the first conv takes
            ``in_c + 1`` channels because ``init_map`` is concatenated to it.
        mid_c: Output channels of the 3x3 ``ConvBNLayer``.
        use_distance: Accepted for interface compatibility; not used.
    """

    def __init__(self, in_c, mid_c, use_distance=True):
        # ``super(self.__class__, self)`` recurses forever as soon as this
        # class is subclassed; the zero-argument form resolves correctly.
        super().__init__()
        self.last_3 = ConvBNLayer(in_c + 1, mid_c, 3, 1, 1, act='relu')
        self.last_1 = nn.Conv2d(mid_c, 1, 1, 1, 0)

    def forward(self, x, init_map, distance_map):
        """Return a single-channel map (no activation applied).

        ``distance_map`` is accepted but ignored.
        """
        outf = torch.cat([init_map, x], dim=1)
        # last Conv
        out = self.last_1(self.last_3(outf))
        return out
class CTCHead(nn.Module):
    """Linear classification head for CTC recognition.

    Args:
        in_channels: Size of the last dimension of the input.
        out_channels: Number of output classes.
        fc_decay: Stored nowhere and not used by this implementation.
        mid_channels: When given, two linear layers
            (``in -> mid -> out``) replace the single ``in -> out`` layer.
        return_feats: In training mode, also return the features fed to the
            final linear layer.
    """

    def __init__(
        self,
        in_channels,
        out_channels=6625,
        fc_decay=0.0004,
        mid_channels=None,
        return_feats=False,
        **kwargs
    ):
        super().__init__()
        self.out_channels = out_channels
        self.mid_channels = mid_channels
        self.return_feats = return_feats

        if mid_channels is not None:
            self.fc1 = nn.Linear(in_channels, mid_channels, bias=True)
            self.fc2 = nn.Linear(mid_channels, out_channels, bias=True)
        else:
            self.fc = nn.Linear(in_channels, out_channels, bias=True)

    def forward(self, x, labels=None):
        """Return class scores for ``x``.

        In eval mode the result is always the softmax over dim 2 and
        ``return_feats`` is ignored. In training mode the raw logits are
        returned, as ``(feats, logits)`` when ``return_feats`` is set.
        """
        if self.mid_channels is not None:
            x = self.fc1(x)
            logits = self.fc2(x)
        else:
            logits = self.fc(x)

        if not self.training:
            return F.softmax(logits, dim=2)
        if self.return_feats:
            return (x, logits)
        return logits
self.head_list = kwargs.pop("head_list") + + self.gtc_head = "sar" + assert len(self.head_list) >= 2 + for idx, head_name in enumerate(self.head_list): + name = list(head_name)[0] + if name == "SARHead": + pass + + elif name == "NRTRHead": + pass + elif name == "CTCHead": + # ctc neck + self.encoder_reshape = Im2Seq(in_channels) + neck_args = self.head_list[idx][name]["Neck"] + encoder_type = neck_args.pop("name") + self.ctc_encoder = SequenceEncoder( + in_channels=in_channels, encoder_type=encoder_type, **neck_args + ) + # ctc head + head_args = self.head_list[idx][name].get("Head", {}) + if head_args is None: + head_args = {} + + self.ctc_head = CTCHead( + in_channels=self.ctc_encoder.out_channels, + out_channels=out_channels_list["CTCLabelDecode"], + **head_args, + ) + else: + raise NotImplementedError(f"{name} is not supported in MultiHead yet") + + def forward(self, x, data=None): + ctc_encoder = self.ctc_encoder(x) + return self.ctc_head(ctc_encoder) diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bbe85bc6a59f8d03541cbeb0e7cff34c5ba6c2e5 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class DSConv(nn.Module):
    """Grouped conv -> BN -> 1x1 expand (x4) -> BN -> [act] -> 1x1 project.

    When ``in_channels != out_channels`` a 1x1 convolution of the input is
    added to the result.

    Args:
        in_channels: Input channels.
        out_channels: Output channels.
        kernel_size: Kernel of the first (grouped) convolution.
        padding: Padding of the first convolution.
        stride: Stride of the first convolution.
        groups: Groups of the first convolution; defaults to ``in_channels``.
        if_act: Apply the activation after the second batch norm.
        act: ``"relu"`` or ``"hardswish"``.
        **kwargs: Accepted and ignored.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        padding,
        stride=1,
        groups=None,
        if_act=True,
        act="relu",
        **kwargs
    ):
        super(DSConv, self).__init__()
        if groups is None:
            groups = in_channels
        self.if_act = if_act
        self.act = act
        self.conv1 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=False,
        )

        self.bn1 = nn.BatchNorm2d(in_channels)

        self.conv2 = nn.Conv2d(
            in_channels=in_channels,
            out_channels=int(in_channels * 4),
            kernel_size=1,
            stride=1,
            bias=False,
        )

        self.bn2 = nn.BatchNorm2d(int(in_channels * 4))

        self.conv3 = nn.Conv2d(
            in_channels=int(in_channels * 4),
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            bias=False,
        )
        self._c = [in_channels, out_channels]
        if in_channels != out_channels:
            self.conv_end = nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                bias=False,
            )

    def forward(self, inputs):
        """Apply the block to ``inputs``.

        Raises:
            ValueError: If ``if_act`` is set and ``act`` is not supported.
        """
        x = self.conv1(inputs)
        x = self.bn1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        if self.if_act:
            if self.act == "relu":
                x = F.relu(x)
            elif self.act == "hardswish":
                x = hard_swish(x)
            else:
                # Raise instead of print + exit(): a library module must not
                # terminate the host process on a bad configuration value.
                raise ValueError(
                    "The activation function({}) is selected incorrectly.".format(
                        self.act
                    )
                )

        x = self.conv3(x)
        if self._c[0] != self._c[1]:
            x = x + self.conv_end(inputs)
        return x
class RSELayer(nn.Module):
    """Convolution followed by an ``SEModule``, optionally residual.

    Args:
        in_channels: Input channels of the convolution.
        out_channels: Output channels, also passed to ``SEModule``.
        kernel_size: Convolution kernel; padding is ``kernel_size // 2``.
        shortcut: When true, the conv output is added to the SE output.
    """

    def __init__(self, in_channels, out_channels, kernel_size, shortcut=True):
        super().__init__()
        self.out_channels = out_channels
        self.in_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=self.out_channels,
            kernel_size=kernel_size,
            padding=int(kernel_size // 2),
            bias=False,
        )
        self.se_block = SEModule(self.out_channels)
        self.shortcut = shortcut

    def forward(self, ins):
        feat = self.in_conv(ins)
        attended = self.se_block(feat)
        if not self.shortcut:
            return attended
        return feat + attended
class LKPAN(nn.Module):
    """Large-kernel path-aggregation neck that fuses four feature maps.

    Each input is projected to ``out_channels`` with a 1x1 convolution,
    merged top-down by nearest-neighbour upsampling, reduced to
    ``out_channels // 4`` channels with 9x9 layers, merged again bottom-up
    through stride-2 convolutions, and finally concatenated at the
    resolution of the first (largest) input.

    Args:
        in_channels: sequence with the channel count of each input map.
            ``forward`` indexes four levels.
        out_channels: channel count of the fused output.
        mode: ``"large"`` uses ``nn.Conv2d`` for the 9x9 layers, ``"lite"``
            uses ``DSConv``. Matching is case-insensitive.
        **kwargs: only ``intracl`` is read; when it is exactly ``True`` an
            ``IntraCLBlock`` is applied to every level before fusion.

    Raises:
        ValueError: if ``mode`` is neither ``"lite"`` nor ``"large"``.
    """

    def __init__(self, in_channels, out_channels, mode="large", **kwargs):
        super().__init__()
        self.out_channels = out_channels
        branch_channels = self.out_channels // 4

        self.ins_conv = nn.ModuleList()
        self.inp_conv = nn.ModuleList()
        # pan head
        self.pan_head_conv = nn.ModuleList()
        self.pan_lat_conv = nn.ModuleList()

        mode_key = mode.lower()
        if mode_key not in ("lite", "large"):
            raise ValueError(
                "mode can only be one of ['lite', 'large'], but received {}".format(
                    mode
                )
            )
        p_layer = DSConv if mode_key == "lite" else nn.Conv2d

        def large_kernel_layer(channels_in):
            # 9x9 layer with "same" padding that outputs one quarter of out_channels.
            return p_layer(
                in_channels=channels_in,
                out_channels=branch_channels,
                kernel_size=9,
                padding=4,
                bias=False,
            )

        # Layers are created in the same order as before so that seeded
        # initialisation yields the same weights.
        for level, level_channels in enumerate(in_channels):
            self.ins_conv.append(
                nn.Conv2d(
                    in_channels=level_channels,
                    out_channels=self.out_channels,
                    kernel_size=1,
                    bias=False,
                )
            )
            self.inp_conv.append(large_kernel_layer(self.out_channels))

            if level > 0:
                # Stride-2 conv that carries a finer level down to the next one.
                self.pan_head_conv.append(
                    nn.Conv2d(
                        in_channels=branch_channels,
                        out_channels=branch_channels,
                        kernel_size=3,
                        padding=1,
                        stride=2,
                        bias=False,
                    )
                )
            self.pan_lat_conv.append(large_kernel_layer(branch_channels))

        self.intracl = kwargs.get("intracl") is True
        if self.intracl:
            self.incl1 = IntraCLBlock(branch_channels, reduce_factor=2)
            self.incl2 = IntraCLBlock(branch_channels, reduce_factor=2)
            self.incl3 = IntraCLBlock(branch_channels, reduce_factor=2)
            self.incl4 = IntraCLBlock(branch_channels, reduce_factor=2)

    def forward(self, x):
        """Fuse ``x = (c2, c3, c4, c5)`` into a single feature map.

        The additions below only line up when every map has half the spatial
        size of the previous one.
        """

        def upsample(tensor, factor):
            return F.interpolate(tensor, scale_factor=factor, mode="nearest")

        c2, c3, c4, c5 = x
        in2, in3, in4, in5 = (
            project(feature)
            for project, feature in zip(self.ins_conv, (c2, c3, c4, c5))
        )

        # Top-down pathway.
        out4 = in4 + upsample(in5, 2)  # 1/16
        out3 = in3 + upsample(out4, 2)  # 1/8
        out2 = in2 + upsample(out3, 2)  # 1/4

        f2, f3, f4, f5 = (
            reduce(feature)
            for reduce, feature in zip(self.inp_conv, (out2, out3, out4, in5))
        )

        # Bottom-up pathway.
        pan3 = f3 + self.pan_head_conv[0](f2)
        pan4 = f4 + self.pan_head_conv[1](pan3)
        pan5 = f5 + self.pan_head_conv[2](pan4)

        p2, p3, p4, p5 = (
            lateral(feature)
            for lateral, feature in zip(self.pan_lat_conv, (f2, pan3, pan4, pan5))
        )

        if self.intracl is True:
            p5 = self.incl4(p5)
            p4 = self.incl3(p4)
            p3 = self.incl2(p3)
            p2 = self.incl1(p2)

        # Bring every level to the resolution of p2 before concatenating.
        fuse = torch.cat(
            [upsample(p5, 8), upsample(p4, 4), upsample(p3, 2), p2], dim=1
        )
        return fuse
class IntraCLBlock(nn.Module):
    """Residual block that refines a feature map with multi-shape convolutions.

    The input is squeezed to ``in_channels // reduce_factor`` channels, passed
    through three stages (7, 5, 3) that each sum a square, a vertical and a
    horizontal convolution, expanded back to ``in_channels``, batch-normalised,
    passed through ReLU and added to the input.

    Args:
        in_channels: channel count of the input and of the output.
        reduce_factor: divisor applied to ``in_channels`` inside the block.
    """

    def __init__(self, in_channels=96, reduce_factor=4):
        super().__init__()
        self.channels = in_channels
        self.rf = reduce_factor
        inner = self.channels // self.rf

        def same_conv(k_h, k_w):
            # Stride-1 conv whose padding keeps the spatial size unchanged.
            return nn.Conv2d(
                inner,
                inner,
                kernel_size=(k_h, k_w),
                stride=(1, 1),
                padding=(k_h // 2, k_w // 2),
            )

        # Creation order is kept so seeded initialisation is unchanged.
        self.conv1x1_reduce_channel = nn.Conv2d(
            self.channels, inner, kernel_size=1, stride=1, padding=0
        )
        self.conv1x1_return_channel = nn.Conv2d(
            inner, self.channels, kernel_size=1, stride=1, padding=0
        )

        # Vertical (k x 1) kernels.
        self.v_layer_7x1 = same_conv(7, 1)
        self.v_layer_5x1 = same_conv(5, 1)
        self.v_layer_3x1 = same_conv(3, 1)

        # Horizontal (1 x k) kernels.
        self.q_layer_1x7 = same_conv(1, 7)
        self.q_layer_1x5 = same_conv(1, 5)
        self.q_layer_1x3 = same_conv(1, 3)

        # base
        self.c_layer_7x7 = same_conv(7, 7)
        self.c_layer_5x5 = same_conv(5, 5)
        self.c_layer_3x3 = same_conv(3, 3)

        self.bn = nn.BatchNorm2d(self.channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``x`` plus the refined relation map (same shape as ``x``)."""
        stages = (
            (self.c_layer_7x7, self.v_layer_7x1, self.q_layer_1x7),
            (self.c_layer_5x5, self.v_layer_5x1, self.q_layer_1x5),
            (self.c_layer_3x3, self.v_layer_3x1, self.q_layer_1x3),
        )

        feat = self.conv1x1_reduce_channel(x)
        for square, vertical, horizontal in stages:
            feat = square(feat) + vertical(feat) + horizontal(feat)

        relation = self.relu(self.bn(self.conv1x1_return_channel(feat)))
        return x + relation
class EncoderWithSVTR(nn.Module):
    """Sequence neck that mixes a conv feature map with global SVTR blocks.

    The input is reduced by two convolutions, flattened into a token
    sequence, passed through ``depth`` global-mixer ``Block`` layers,
    reshaped back to a map, concatenated with the shortcut and projected to
    ``dims`` channels.

    Args:
        in_channels: channel count of the input feature map.
        dims: channel count of the output (stored as ``out_channels``).
        depth: number of SVTR blocks.
        hidden_dims: token width inside the SVTR blocks.
        use_guide: when true, the input is detached so no gradient flows
            back through this neck into the producer of ``x``.
        num_heads, qkv_bias, mlp_ratio, drop_rate, attn_drop_rate,
        drop_path, qk_scale: forwarded to every ``Block``.
        kernel_size: two-element kernel size of the first convolution.
    """

    def __init__(
        self,
        in_channels,
        dims=64,  # XS
        depth=2,
        hidden_dims=120,
        use_guide=False,
        num_heads=8,
        qkv_bias=True,
        mlp_ratio=2.0,
        drop_rate=0.1,
        # NOTE: mutable default kept for interface compatibility; it is only
        # read here, never mutated.
        kernel_size=[3, 3],
        attn_drop_rate=0.1,
        drop_path=0.0,
        qk_scale=None,
    ):
        super(EncoderWithSVTR, self).__init__()
        self.depth = depth
        self.use_guide = use_guide
        self.conv1 = ConvBNLayer(
            in_channels,
            in_channels // 8,
            kernel_size=kernel_size,
            padding=[kernel_size[0] // 2, kernel_size[1] // 2],
            act="swish",
        )
        self.conv2 = ConvBNLayer(
            in_channels // 8, hidden_dims, kernel_size=1, act="swish"
        )

        self.svtr_block = nn.ModuleList(
            [
                Block(
                    dim=hidden_dims,
                    num_heads=num_heads,
                    mixer="Global",
                    HW=None,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    act_layer="swish",
                    attn_drop=attn_drop_rate,
                    drop_path=drop_path,
                    norm_layer="nn.LayerNorm",
                    epsilon=1e-05,
                    prenorm=False,
                )
                for _ in range(depth)
            ]
        )
        self.norm = nn.LayerNorm(hidden_dims, eps=1e-6)
        self.conv3 = ConvBNLayer(hidden_dims, in_channels, kernel_size=1, act="swish")
        # last conv-nxn, the input is concat of input tensor and conv3 output tensor
        self.conv4 = ConvBNLayer(
            2 * in_channels, in_channels // 8, padding=1, act="swish"
        )

        self.conv1x1 = ConvBNLayer(in_channels // 8, dims, kernel_size=1, act="swish")
        self.out_channels = dims
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; called for every module via ``apply``."""
        if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
            nn.init.kaiming_normal_(m.weight, mode="fan_out")
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, (nn.BatchNorm2d, nn.LayerNorm)):
            nn.init.ones_(m.weight)
            nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, 0, 0.01)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Encode a ``(B, C, H, W)`` map; the result has ``out_channels`` channels."""
        # for use guide
        if self.use_guide:
            # ``stop_gradient`` is a Paddle attribute; assigning it on a torch
            # tensor has no effect. ``detach`` is the torch way to cut the graph.
            z = x.clone().detach()
        else:
            z = x
        # for short cut
        h = z
        # reduce dim
        z = self.conv1(z)
        z = self.conv2(z)
        # SVTR global block: flatten the map into (B, H*W, C) tokens
        B, C, H, W = z.shape
        z = z.flatten(2).permute(0, 2, 1)

        for blk in self.svtr_block:
            z = blk(z)

        z = self.norm(z)
        # last stage: tokens back to a (B, C, H, W) map
        z = z.reshape([-1, H, W, C]).permute(0, 3, 1, 2)
        z = self.conv3(z)
        z = torch.cat((h, z), dim=1)
        z = self.conv1x1(self.conv4(z))

        return z
def build_post_process(config, global_config=None):
    """Instantiate the post-processor named by ``config['name']``.

    Args:
        config: mapping with a ``'name'`` key selecting the class; all other
            entries are passed to the constructor as keyword arguments. The
            mapping is deep-copied and never modified.
        global_config: optional mapping merged over ``config`` before the
            constructor is called.

    Returns:
        An instance of the selected post-processing class.

    Raises:
        KeyError: if ``config`` has no ``'name'`` entry.
        AssertionError: if the name is not a supported post-processor.
    """
    from .db_postprocess import DBPostProcess
    from .rec_postprocess import CTCLabelDecode, AttnLabelDecode, SRNLabelDecode, TableLabelDecode, \
        NRTRLabelDecode, SARLabelDecode, ViTSTRLabelDecode, RFLLabelDecode
    from .cls_postprocess import ClsPostProcess
    from .rec_postprocess import CANLabelDecode

    # Explicit name -> class table: the class is never resolved with eval(),
    # so a name coming from a config file can only select one of these.
    support_dict = {
        'DBPostProcess': DBPostProcess,
        'CTCLabelDecode': CTCLabelDecode,
        'AttnLabelDecode': AttnLabelDecode,
        'ClsPostProcess': ClsPostProcess,
        'SRNLabelDecode': SRNLabelDecode,
        'TableLabelDecode': TableLabelDecode,
        'NRTRLabelDecode': NRTRLabelDecode,
        'SARLabelDecode': SARLabelDecode,
        'ViTSTRLabelDecode': ViTSTRLabelDecode,
        'CANLabelDecode': CANLabelDecode,
        'RFLLabelDecode': RFLLabelDecode,
    }

    config = copy.deepcopy(config)
    module_name = config.pop('name')
    if global_config is not None:
        config.update(global_config)
    if module_name not in support_dict:
        # Raised explicitly (same exception type as the former ``assert``) so
        # the check is not stripped when Python runs with -O.
        raise AssertionError(
            'post process only support {}, but got {}'.format(
                list(support_dict), module_name))
    module_class = support_dict[module_name](**config)
    return module_class
class DBPostProcess(object):
    """
    The post process for Differentiable Binarization (DB).

    Turns a probability map into boxes: threshold the map, find contours,
    score each contour's minimum-area rectangle, expand ("unclip") it and
    rescale it to the destination image size.

    Args:
        thresh: probability above which a pixel belongs to the text mask.
        box_thresh: minimum mean score for a box to be kept.
        max_candidates: maximum number of contours examined per image.
        unclip_ratio: expansion ratio used by ``unclip``.
        use_dilation: dilate the mask with a 2x2 kernel before contouring.
        score_mode: ``"fast"`` scores the rectangle, ``"slow"`` the contour.
    """

    def __init__(self,
                 thresh=0.3,
                 box_thresh=0.7,
                 max_candidates=1000,
                 unclip_ratio=2.0,
                 use_dilation=False,
                 score_mode="fast",
                 **kwargs):
        self.thresh = thresh
        self.box_thresh = box_thresh
        self.max_candidates = max_candidates
        self.unclip_ratio = unclip_ratio
        self.min_size = 3
        self.score_mode = score_mode
        assert score_mode in [
            "slow", "fast"
        ], "Score mode must be in [slow, fast] but got: {}".format(score_mode)

        self.dilation_kernel = None if not use_dilation else np.array(
            [[1, 1], [1, 1]])

    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        '''
        Extract boxes from one binarized map.

        pred: score map of one image, indexed as (H, W).
        _bitmap: single map with shape (H, W),
                whose values are binarized as {0, 1}
        dest_width, dest_height: size the box coordinates are rescaled to.

        Returns (boxes, scores): an int16 array of 4-point boxes and the
        list of their scores.
        '''

        bitmap = _bitmap
        height, width = bitmap.shape

        outs = cv2.findContours((bitmap * 255).astype(np.uint8), cv2.RETR_LIST,
                                cv2.CHAIN_APPROX_SIMPLE)
        # Some OpenCV versions return (image, contours, hierarchy), others
        # (contours, hierarchy); pick the contours without leaving the name
        # unbound for any arity.
        contours = outs[1] if len(outs) == 3 else outs[0]

        num_contours = min(len(contours), self.max_candidates)

        boxes = []
        scores = []
        for index in range(num_contours):
            contour = contours[index]
            points, sside = self.get_mini_boxes(contour)
            if sside < self.min_size:
                continue
            points = np.array(points)
            if self.score_mode == "fast":
                score = self.box_score_fast(pred, points.reshape(-1, 2))
            else:
                score = self.box_score_slow(pred, contour)
            if self.box_thresh > score:
                continue

            # Offsetting can yield no polygon or several; neither can be
            # reshaped into a single box, so such candidates are skipped.
            paths = self._unclip_paths(points)
            if len(paths) != 1:
                continue
            box = np.array(paths[0]).reshape(-1, 1, 2)
            box, sside = self.get_mini_boxes(box)
            if sside < self.min_size + 2:
                continue
            box = np.array(box)

            box[:, 0] = np.clip(
                np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height)
            boxes.append(box.astype(np.int16))
            scores.append(score)
        return np.array(boxes, dtype=np.int16), scores

    def _unclip_paths(self, box):
        """Return the list of polygons produced by expanding ``box``."""
        unclip_ratio = self.unclip_ratio
        poly = Polygon(box)
        distance = poly.area * unclip_ratio / poly.length
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        return offset.Execute(distance)

    def unclip(self, box):
        """Expand ``box`` by ``area * unclip_ratio / perimeter``; returns an array."""
        expanded = np.array(self._unclip_paths(box))
        return expanded

    def get_mini_boxes(self, contour):
        """Return the 4 corners of the min-area rectangle and its shorter side."""
        bounding_box = cv2.minAreaRect(contour)
        # Sort corners by x, then order the left pair and the right pair by y.
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        if points[1][1] > points[0][1]:
            index_1, index_4 = 0, 1
        else:
            index_1, index_4 = 1, 0
        if points[3][1] > points[2][1]:
            index_2, index_3 = 2, 3
        else:
            index_2, index_3 = 3, 2

        box = [
            points[index_1], points[index_2], points[index_3], points[index_4]
        ]
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        '''
        box_score_fast: use bbox mean score as the mean score
        '''
        h, w = bitmap.shape[:2]
        box = _box.copy()
        xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        # Shift the box into the coordinate frame of the cropped mask.
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]

    def box_score_slow(self, bitmap, contour):
        '''
        box_score_slow: use polygon mean score as the mean score
        '''
        h, w = bitmap.shape[:2]
        contour = contour.copy()
        contour = np.reshape(contour, (-1, 2))

        xmin = np.clip(np.min(contour[:, 0]), 0, w - 1)
        xmax = np.clip(np.max(contour[:, 0]), 0, w - 1)
        ymin = np.clip(np.min(contour[:, 1]), 0, h - 1)
        ymax = np.clip(np.max(contour[:, 1]), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)

        contour[:, 0] = contour[:, 0] - xmin
        contour[:, 1] = contour[:, 1] - ymin

        cv2.fillPoly(mask, contour.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]

    def __call__(self, outs_dict, shape_list):
        """Return one ``{'points': boxes}`` dict per image in the batch.

        outs_dict: mapping whose ``'maps'`` entry is indexed as (N, C, H, W);
            only channel 0 is used.
        shape_list: per image ``(src_h, src_w, ratio_h, ratio_w)``; only the
            source height and width are used.
        """
        pred = outs_dict['maps']
        if isinstance(pred, torch.Tensor):
            # detach() so tensors that still require grad can be converted.
            pred = pred.detach().cpu().numpy()
        pred = pred[:, 0, :, :]
        segmentation = pred > self.thresh

        boxes_batch = []
        for batch_index in range(pred.shape[0]):
            src_h, src_w, ratio_h, ratio_w = shape_list[batch_index]
            if self.dilation_kernel is not None:
                mask = cv2.dilate(
                    np.array(segmentation[batch_index]).astype(np.uint8),
                    self.dilation_kernel)
            else:
                mask = segmentation[batch_index]
            boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,
                                                   src_w, src_h)

            boxes_batch.append({'points': boxes})
        return boxes_batch
class BaseRecLabelDecode(object):
    """ Convert between text-label and text-index

    Args:
        character_dict_path: path (``str`` or path-like) of a UTF-8 file with
            one dictionary entry per line, or ``None`` for the built-in
            digits + lowercase letters.
        use_space_char: append ``" "`` to a dictionary loaded from file.
    """

    def __init__(self,
                 character_dict_path=None,
                 use_space_char=False):

        self.beg_str = "sos"
        self.end_str = "eos"
        self.reverse = False
        self.character_str = []

        if character_dict_path is None:
            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
            dict_character = list(self.character_str)
        else:
            with open(character_dict_path, "rb") as fin:
                for raw_line in fin:
                    line = raw_line.decode('utf-8').strip("\n").strip("\r\n")
                    self.character_str.append(line)
            if use_space_char:
                self.character_str.append(" ")
            dict_character = list(self.character_str)
            # str() so path-like objects work with the substring test too.
            if "arabic" in str(character_dict_path):
                self.reverse = True

        dict_character = self.add_special_char(dict_character)
        self.dict = {}
        for i, char in enumerate(dict_character):
            self.dict[char] = i
        self.character = dict_character

    def pred_reverse(self, pred):
        """Reverse ``pred`` while keeping runs of Latin/digit characters intact."""
        pred_re = []
        c_current = ""
        for c in pred:
            if not bool(re.search("[a-zA-Z0-9 :*./%+-]", c)):
                if c_current != "":
                    pred_re.append(c_current)
                pred_re.append(c)
                c_current = ""
            else:
                c_current += c
        if c_current != "":
            pred_re.append(c_current)

        return "".join(pred_re[::-1])

    def add_special_char(self, dict_character):
        """Hook for subclasses to add special tokens; the base adds none."""
        return dict_character

    def get_word_info(self, text, selection):
        """
        Group the decoded characters and record the corresponding decoded positions.

        Args:
            text: the decoded text
            selection: the bool array that identifies which columns of features are decoded as non-separated characters
        Returns:
            word_list: list of the grouped words
            word_col_list: list of decoding positions corresponding to each character in the grouped word
            state_list: list of marker to identify the type of grouping words, including two types of grouping words:
                        - 'cn': continuous chinese characters (e.g., 你好啊)
                        - 'en&num': continuous english characters (e.g., hello), number (e.g., 123, 1.123), or mixed of them connected by '-' (e.g., VGG-16)
                        The remaining characters in text are treated as separators between groups (e.g., space, '(', ')', etc.).
        """
        state = None
        word_content = []
        word_col_content = []
        word_list = []
        word_col_list = []
        state_list = []
        valid_col = np.where(selection == True)[0]  # noqa: E712 (element-wise)

        for c_i, char in enumerate(text):
            if "\u4e00" <= char <= "\u9fff":
                c_state = "cn"
            elif bool(re.search("[a-zA-Z0-9]", char)):
                c_state = "en&num"
            else:
                c_state = "splitter"

            if (
                char == "."
                and state == "en&num"
                and c_i + 1 < len(text)
                and bool(re.search("[0-9]", text[c_i + 1]))
            ):  # grouping floating number
                c_state = "en&num"
            if (
                char == "-" and state == "en&num"
            ):  # grouping word with '-', such as 'state-of-the-art'
                c_state = "en&num"

            if state is None:
                state = c_state

            if state != c_state:
                # Group boundary: flush the word collected so far.
                if len(word_content) != 0:
                    word_list.append(word_content)
                    word_col_list.append(word_col_content)
                    state_list.append(state)
                    word_content = []
                    word_col_content = []
                state = c_state

            if state != "splitter":
                word_content.append(char)
                word_col_content.append(valid_col[c_i])

        if len(word_content) != 0:
            word_list.append(word_content)
            word_col_list.append(word_col_content)
            state_list.append(state)

        return word_list, word_col_list, state_list

    def decode(
            self,
            text_index,
            text_prob=None,
            is_remove_duplicate=False,
            return_word_box=False,
    ):
        """ convert text-index into text-label.

        Args:
            text_index: per-sample sequences of character indices.
            text_prob: optional per-sample confidences aligned with
                ``text_index``; every kept character scores 1 when omitted.
            is_remove_duplicate: drop an index equal to its predecessor.
            return_word_box: also return word grouping information.

        Returns:
            A list with one ``(text, mean_confidence)`` tuple per sample. With
            ``return_word_box`` each tuple carries a third element
            ``[sequence_length, word_list, word_col_list, state_list]``.
            A sample with no kept character scores 0 instead of NaN.
        """
        result_list = []
        ignored_tokens = self.get_ignored_tokens()
        batch_size = len(text_index)
        for batch_idx in range(batch_size):
            indices = text_index[batch_idx]
            char_list = []
            conf_list = []
            # Marks the columns that contributed a character to the text.
            selection = np.zeros(len(indices), dtype=bool)
            for idx in range(len(indices)):
                if indices[idx] in ignored_tokens:
                    continue
                if is_remove_duplicate:
                    # only for predict
                    if idx > 0 and indices[idx - 1] == indices[idx]:
                        continue
                selection[idx] = True
                char_list.append(self.character[int(indices[idx])])
                if text_prob is not None:
                    conf_list.append(text_prob[batch_idx][idx])
                else:
                    conf_list.append(1)
            if len(conf_list) == 0:
                # Avoid np.mean([]) -> NaN plus a RuntimeWarning.
                conf_list = [0]
            text = ''.join(char_list)
            if return_word_box:
                word_list, word_col_list, state_list = self.get_word_info(
                    text, selection)
                result_list.append((
                    text,
                    np.mean(conf_list),
                    [len(indices), word_list, word_col_list, state_list],
                ))
            else:
                result_list.append((text, np.mean(conf_list)))
        return result_list

    def get_ignored_tokens(self):
        return [0]  # for ctc blank
class NRTRLabelDecode(BaseRecLabelDecode):
    """ Convert between text-label and text-index """

    def __init__(self, character_dict_path=None, use_space_char=True, **kwargs):
        super(NRTRLabelDecode, self).__init__(character_dict_path,
                                              use_space_char)

    @staticmethod
    def _to_numpy(value):
        """Return ``value`` as a NumPy array when it is a torch tensor."""
        if isinstance(value, torch.Tensor):
            # detach/cpu so CUDA tensors and tensors requiring grad convert too.
            return value.detach().cpu().numpy()
        return value

    def __call__(self, preds, label=None, *args, **kwargs):
        """Decode model output into ``(text, confidence)`` tuples.

        ``preds`` is either a pair ``(ids, probabilities)`` or a single score
        array that is arg-maxed over its last axis. When ``label`` is given
        its first column is dropped, it is decoded as well, and
        ``(text, label)`` is returned.
        """
        # NOTE(review): a single score tensor with batch size 2 also has
        # len(preds) == 2 and would take the pair branch — confirm callers
        # always pass a tuple/list for the pair form.
        if len(preds) == 2:
            preds_id = self._to_numpy(preds[0])
            preds_prob = self._to_numpy(preds[1])
            if preds_id[0][0] == 2:
                # Sequence starts with index 2: drop that leading column.
                preds_idx = preds_id[:, 1:]
                preds_prob = preds_prob[:, 1:]
            else:
                preds_idx = preds_id
            text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
            if label is None:
                return text
            label = self.decode(label[:, 1:])
        else:
            preds = self._to_numpy(preds)
            preds_idx = preds.argmax(axis=2)
            preds_prob = preds.max(axis=2)
            text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
            if label is None:
                return text
            label = self.decode(label[:, 1:])
        return text, label

    def add_special_char(self, dict_character):
        # NOTE(review): the three special tokens after 'blank' are empty
        # strings in this file, so they collide in ``self.dict`` and all of
        # them end decoding. They look like stripped angle-bracket tokens —
        # confirm against the reference dictionary before changing them.
        dict_character = ['blank', '', '', ''] + dict_character
        return dict_character

    def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
        """ convert text-index into text-label.

        Indices that cannot be mapped to a dictionary entry are skipped and
        decoding of a sample stops at the first ``''`` entry. The text is
        lower-cased; a sample with no kept character scores 0 instead of NaN.
        ``is_remove_duplicate`` is accepted for interface compatibility and
        not used.
        """
        result_list = []
        batch_size = len(text_index)
        for batch_idx in range(batch_size):
            char_list = []
            conf_list = []
            for idx in range(len(text_index[batch_idx])):
                try:
                    char_idx = self.character[int(text_index[batch_idx][idx])]
                except (IndexError, ValueError, TypeError, OverflowError):
                    # Out-of-range or non-integral index: skip this position.
                    continue
                if char_idx == '':  # end
                    break
                char_list.append(char_idx)
                if text_prob is not None:
                    conf_list.append(text_prob[batch_idx][idx])
                else:
                    conf_list.append(1)
            if len(conf_list) == 0:
                # Avoid np.mean([]) -> NaN plus a RuntimeWarning.
                conf_list = [0]
            text = ''.join(char_list)
            result_list.append((text.lower(), np.mean(conf_list).tolist()))
        return result_list
class RFLLabelDecode(BaseRecLabelDecode):
    """ Convert between text-label and text-index """

    def __init__(self, character_dict_path=None, use_space_char=False,
                 **kwargs):
        super(RFLLabelDecode, self).__init__(character_dict_path,
                                             use_space_char)

    def add_special_char(self, dict_character):
        """Put the begin token first and the end token last in the dictionary."""
        self.beg_str = "sos"
        self.end_str = "eos"
        dict_character = [self.beg_str] + dict_character + [self.end_str]
        return dict_character

    def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
        """ convert text-index into text-label.

        Decoding of a sample stops at the first end token and begin tokens
        are skipped. Returns one ``(text, mean_confidence)`` tuple per sample;
        a sample with no kept character scores 0 instead of NaN.
        """
        result_list = []
        beg_idx, end_idx = self.get_ignored_tokens()
        batch_size = len(text_index)
        for batch_idx in range(batch_size):
            char_list = []
            conf_list = []
            for idx in range(len(text_index[batch_idx])):
                token = int(text_index[batch_idx][idx])
                # The end check must come before the ignore check: the end
                # token is itself an ignored token, so testing it second made
                # the break unreachable and text after "eos" was kept.
                if token == int(end_idx):
                    break
                if token == int(beg_idx):
                    continue
                if is_remove_duplicate:
                    # only for predict
                    if idx > 0 and text_index[batch_idx][idx - 1] == text_index[
                            batch_idx][idx]:
                        continue
                char_list.append(self.character[token])
                if text_prob is not None:
                    conf_list.append(text_prob[batch_idx][idx])
                else:
                    conf_list.append(1)
            if len(conf_list) == 0:
                # Avoid np.mean([]) -> NaN plus a RuntimeWarning.
                conf_list = [0]
            text = ''.join(char_list)
            result_list.append((text, np.mean(conf_list).tolist()))
        return result_list

    def __call__(self, preds, label=None, *args, **kwargs):
        """Decode either sequence output or counting output.

        A tuple/list ``preds`` is unpacked as ``(cnt_outputs, seq_outputs)``
        and the sequence part is decoded to text. Any other ``preds`` is
        treated as counting output and turned into one rounded length per
        sample. With ``label`` the decoded label (or its text lengths) is
        returned as a second value.
        """
        # if seq_outputs is not None:
        if isinstance(preds, (tuple, list)):
            cnt_outputs, seq_outputs = preds
            if isinstance(seq_outputs, torch.Tensor):
                # detach/cpu so CUDA tensors and tensors requiring grad convert.
                seq_outputs = seq_outputs.detach().cpu().numpy()
            preds_idx = seq_outputs.argmax(axis=2)
            preds_prob = seq_outputs.max(axis=2)
            text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)

            if label is None:
                return text
            label = self.decode(label, is_remove_duplicate=False)
            return text, label

        else:
            cnt_outputs = preds
            if isinstance(cnt_outputs, torch.Tensor):
                cnt_outputs = cnt_outputs.detach().cpu().numpy()
            cnt_length = []
            for lens in cnt_outputs:
                length = round(np.sum(lens))
                cnt_length.append(length)
            if label is None:
                return cnt_length
            label = self.decode(label, is_remove_duplicate=False)
            length = [len(res[0]) for res in label]
            return cnt_length, length

    def get_ignored_tokens(self):
        """Return ``[begin_index, end_index]``."""
        beg_idx = self.get_beg_end_flag_idx("beg")
        end_idx = self.get_beg_end_flag_idx("end")
        return [beg_idx, end_idx]

    def get_beg_end_flag_idx(self, beg_or_end):
        """Return the dictionary index of the begin or end token as an array."""
        if beg_or_end == "beg":
            idx = np.array(self.dict[self.beg_str])
        elif beg_or_end == "end":
            idx = np.array(self.dict[self.end_str])
        else:
            assert False, "unsupport type %s in get_beg_end_flag_idx" \
                          % beg_or_end
        return idx
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + batch_size = len(text_index) + + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list))) + return result_list + + def add_special_char(self, dict_character): + dict_character = dict_character + [self.beg_str, self.end_str] + return dict_character + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end): + if beg_or_end == "beg": + idx = np.array(self.dict[self.beg_str]) + elif beg_or_end == "end": + idx = np.array(self.dict[self.end_str]) + else: + assert False, "unsupport type %s in get_beg_end_flag_idx" \ + % beg_or_end + return idx + + +class TableLabelDecode(object): + """ """ + + def __init__(self, + character_dict_path, + **kwargs): + list_character, list_elem = self.load_char_elem_dict(character_dict_path) + list_character = self.add_special_char(list_character) + list_elem = self.add_special_char(list_elem) + self.dict_character = {} + self.dict_idx_character = {} + for i, char in enumerate(list_character): + self.dict_idx_character[i] = char + self.dict_character[char] = i + self.dict_elem = {} + self.dict_idx_elem = {} + for i, elem in enumerate(list_elem): + self.dict_idx_elem[i] = elem + self.dict_elem[elem] = i + + def load_char_elem_dict(self, character_dict_path): + list_character = [] + list_elem = [] + with open(character_dict_path, 
"rb") as fin: + lines = fin.readlines() + substr = lines[0].decode('utf-8').strip("\n").strip("\r\n").split("\t") + character_num = int(substr[0]) + elem_num = int(substr[1]) + for cno in range(1, 1 + character_num): + character = lines[cno].decode('utf-8').strip("\n").strip("\r\n") + list_character.append(character) + for eno in range(1 + character_num, 1 + character_num + elem_num): + elem = lines[eno].decode('utf-8').strip("\n").strip("\r\n") + list_elem.append(elem) + return list_character, list_elem + + def add_special_char(self, list_character): + self.beg_str = "sos" + self.end_str = "eos" + list_character = [self.beg_str] + list_character + [self.end_str] + return list_character + + def __call__(self, preds): + structure_probs = preds['structure_probs'] + loc_preds = preds['loc_preds'] + if isinstance(structure_probs,torch.Tensor): + structure_probs = structure_probs.numpy() + if isinstance(loc_preds,torch.Tensor): + loc_preds = loc_preds.numpy() + structure_idx = structure_probs.argmax(axis=2) + structure_probs = structure_probs.max(axis=2) + structure_str, structure_pos, result_score_list, result_elem_idx_list = self.decode(structure_idx, + structure_probs, 'elem') + res_html_code_list = [] + res_loc_list = [] + batch_num = len(structure_str) + for bno in range(batch_num): + res_loc = [] + for sno in range(len(structure_str[bno])): + text = structure_str[bno][sno] + if text in ['', ' 0 and tmp_elem_idx == end_idx: + break + if tmp_elem_idx in ignored_tokens: + continue + + char_list.append(current_dict[tmp_elem_idx]) + elem_pos_list.append(idx) + score_list.append(structure_probs[batch_idx, idx]) + elem_idx_list.append(tmp_elem_idx) + result_list.append(char_list) + result_pos_list.append(elem_pos_list) + result_score_list.append(score_list) + result_elem_idx_list.append(elem_idx_list) + return result_list, result_pos_list, result_score_list, result_elem_idx_list + + def get_ignored_tokens(self, char_or_elem): + beg_idx = self.get_beg_end_flag_idx("beg", 
char_or_elem) + end_idx = self.get_beg_end_flag_idx("end", char_or_elem) + return [beg_idx, end_idx] + + def get_beg_end_flag_idx(self, beg_or_end, char_or_elem): + if char_or_elem == "char": + if beg_or_end == "beg": + idx = self.dict_character[self.beg_str] + elif beg_or_end == "end": + idx = self.dict_character[self.end_str] + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx of char" \ + % beg_or_end + elif char_or_elem == "elem": + if beg_or_end == "beg": + idx = self.dict_elem[self.beg_str] + elif beg_or_end == "end": + idx = self.dict_elem[self.end_str] + else: + assert False, "Unsupport type %s in get_beg_end_flag_idx of elem" \ + % beg_or_end + else: + assert False, "Unsupport type %s in char_or_elem" \ + % char_or_elem + return idx + + +class SARLabelDecode(BaseRecLabelDecode): + """ Convert between text-label and text-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(SARLabelDecode, self).__init__(character_dict_path, + use_space_char) + + self.rm_symbol = kwargs.get('rm_symbol', False) + + def add_special_char(self, dict_character): + beg_end_str = "" + unknown_str = "" + padding_str = "" + dict_character = dict_character + [unknown_str] + self.unknown_idx = len(dict_character) - 1 + dict_character = dict_character + [beg_end_str] + self.start_idx = len(dict_character) - 1 + self.end_idx = len(dict_character) - 1 + dict_character = dict_character + [padding_str] + self.padding_idx = len(dict_character) - 1 + return dict_character + + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. 
""" + result_list = [] + ignored_tokens = self.get_ignored_tokens() + + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(self.end_idx): + if text_prob is None and idx == 0: + continue + else: + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + if self.rm_symbol: + comp = re.compile('[^A-Z^a-z^0-9^\u4e00-\u9fa5]') + text = text.lower() + text = comp.sub('', text) + result_list.append((text, np.mean(conf_list).tolist())) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + if isinstance(preds, torch.Tensor): + preds = preds.cpu().numpy() + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + + if label is None: + return text + label = self.decode(label, is_remove_duplicate=False) + return text, label + + def get_ignored_tokens(self): + return [self.padding_idx] + + +class CANLabelDecode(BaseRecLabelDecode): + """ Convert between latex-symbol and symbol-index """ + + def __init__(self, character_dict_path=None, use_space_char=False, + **kwargs): + super(CANLabelDecode, self).__init__(character_dict_path, + use_space_char) + + def decode(self, text_index, preds_prob=None): + result_list = [] + batch_size = len(text_index) + for batch_idx in range(batch_size): + seq_end = text_index[batch_idx].argmin(0) + idx_list = text_index[batch_idx][:seq_end].tolist() + symbol_list = [self.character[idx] for idx in idx_list] + probs = [] + if preds_prob is not None: + 
probs = preds_prob[batch_idx][:len(symbol_list)].tolist() + + result_list.append([' '.join(symbol_list), probs]) + return result_list + + def __call__(self, preds, label=None, *args, **kwargs): + pred_prob, _, _, _ = preds + preds_idx = pred_prob.argmax(axis=2) + + text = self.decode(preds_idx) + if label is None: + return text + label = self.decode(label) + return text, label \ No newline at end of file diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1bbbb48d0d77c61d1f1d9f134fece3c6db7c223d --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml @@ -0,0 +1,571 @@ +ch_ptocr_mobile_v2.0_cls_infer: + model_type: cls + algorithm: CLS + Transform: + Backbone: + name: MobileNetV3 + scale: 0.35 + model_name: small + Neck: + Head: + name: ClsHead + class_dim: 2 + +Multilingual_PP-OCRv3_det_infer: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +en_PP-OCRv3_det_infer: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +ch_PP-OCRv3_det_infer: + model_type: det + algorithm: DB + Transform: + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + disable_se: True + Neck: + 
name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +en_PP-OCRv4_rec_infer: + model_type: rec + algorithm: SVTR_LCNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + out_channels_list: + CTCLabelDecode: 97 #'blank' + ...(62) + ' ' + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [ 1, 3 ] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 + +ch_PP-OCRv4_det_infer: + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPLCNetV3 + scale: 0.75 + det: True + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +ch_PP-OCRv5_det_infer: + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPLCNetV3 + scale: 0.75 + det: True + Neck: + name: RSEFPN + out_channels: 96 + shortcut: True + Head: + name: DBHead + k: 50 + +ch_PP-OCRv5_det_server_infer: + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPHGNetV2_B4 + det: True + Neck: + name: LKPAN + out_channels: 256 + intracl: True + Head: + name: PFHeadLocal + k: 50 + mode: "large" + +ch_PP-OCRv4_det_server_infer: + model_type: det + algorithm: DB + Transform: null + Backbone: + name: PPHGNet_small + det: True + Neck: + name: LKPAN + out_channels: 256 + intracl: true + Head: + name: PFHeadLocal + k: 50 + mode: "large" + +ch_PP-OCRv4_rec_infer: + model_type: rec + algorithm: SVTR_LCNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + out_channels_list: + CTCLabelDecode: 6625 #'blank' + ...(6623) + ' ' + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [ 1, 3 ] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 + +ch_PP-OCRv4_rec_server_infer: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPHGNet_small + 
Head: + name: MultiHead + out_channels_list: + CTCLabelDecode: 6625 #'blank' + ...(6623) + ' ' + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [ 1, 3 ] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 + +ch_PP-OCRv4_rec_server_doc_infer: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPHGNet_small + Head: + name: MultiHead + out_channels_list: + CTCLabelDecode: 15631 + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [ 1, 3 ] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 + +ch_PP-OCRv5_rec_server_infer: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPHGNetV2_B4 + text_rec: True + Head: + name: MultiHead + out_channels_list: + CTCLabelDecode: 18385 + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [ 1, 3 ] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 + +ch_PP-OCRv5_rec_infer: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + out_channels_list: + CTCLabelDecode: 18385 + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [ 1, 3 ] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 + +chinese_cht_PP-OCRv3_rec_infer: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [1, 2] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead +# out_channels: 8423 + fc_decay: 0.00001 + +latin_PP-OCRv3_rec_infer: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + 
name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [ 1, 2 ] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead +# out_channels: 187 + fc_decay: 0.00001 + +cyrillic_PP-OCRv3_rec_infer: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [ 1, 2 ] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead +# out_channels: 165 + fc_decay: 0.00001 + +arabic_PP-OCRv3_rec_infer: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [ 1, 2 ] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead +# out_channels: 164 + fc_decay: 0.00001 + +korean_PP-OCRv3_rec_infer: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [ 1, 2 ] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead +# out_channels: 3690 + fc_decay: 0.00001 + +japan_PP-OCRv3_rec_infer: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [ 1, 2 ] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead +# out_channels: 4401 + fc_decay: 0.00001 + +ta_PP-OCRv3_rec_infer: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [ 1, 2 ] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + 
Head: + name: CTCHead +# out_channels: 130 + fc_decay: 0.00001 + +te_PP-OCRv3_rec_infer: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [ 1, 2 ] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead +# out_channels: 153 + fc_decay: 0.00001 + +ka_PP-OCRv3_rec_infer: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [ 1, 2 ] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead +# out_channels: 155 + fc_decay: 0.00001 + +devanagari_PP-OCRv3_rec_infer: + model_type: rec + algorithm: SVTR + Transform: + Backbone: + name: MobileNetV1Enhance + scale: 0.5 + last_conv_stride: [ 1, 2 ] + last_pool_type: avg + Neck: + name: SequenceEncoder + encoder_type: svtr + dims: 64 + depth: 2 + hidden_dims: 120 + use_guide: True + Head: + name: CTCHead +# out_channels: 169 + fc_decay: 0.00001 + +korean_PP-OCRv5_rec_infer: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + out_channels_list: + CTCLabelDecode: 11947 + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [ 1, 3 ] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 + +latin_PP-OCRv5_rec_infer: + model_type: rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + out_channels_list: + CTCLabelDecode: 504 + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [ 1, 3 ] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 + +eslav_PP-OCRv5_rec_infer: + model_type: 
rec + algorithm: SVTR_HGNet + Transform: + Backbone: + name: PPLCNetV3 + scale: 0.95 + Head: + name: MultiHead + out_channels_list: + CTCLabelDecode: 519 + head_list: + - CTCHead: + Neck: + name: svtr + dims: 120 + depth: 2 + hidden_dims: 120 + kernel_size: [ 1, 3 ] + use_guide: True + Head: + fc_decay: 0.00001 + - NRTRHead: + nrtr_dim: 384 + max_text_length: 25 + + diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..e97abf39274df77fbad066ee4635aebc6743140c --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt @@ -0,0 +1,162 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ء +آ +أ +ؤ +إ +ئ +ا +ب +ة +ت +ث +ج +ح +خ +د +ذ +ر +ز +س +ش +ص +ض +ط +ظ +ع +غ +ف +ق +ك +ل +م +ن +ه +و +ى +ي +ً +ٌ +ٍ +َ +ُ +ِ +ّ +ْ +ٓ +ٔ +ٰ +ٱ +ٹ +پ +چ +ڈ +ڑ +ژ +ک +ڭ +گ +ں +ھ +ۀ +ہ +ۂ +ۃ +ۆ +ۇ +ۈ +ۋ +ی +ې +ے +ۓ +ە +١ +٢ +٣ +٤ +٥ +٦ +٧ +٨ +٩ diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc1aa4724b9a6f0e15275bcf61c91c26b6550c3e --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt @@ -0,0 +1,8421 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +¥ +® +° +± +² +´ +· +» +É +Ë +Ó +× +Ü +à +á +ä +è +é +ì +í +ò +ó +÷ +ú +ü +ā +ē +ī +ō +ū +ǐ +ǒ +ɔ +ɡ +ʌ +ˋ +Λ +Ο +Φ +Ω +α +β +ε +θ +μ +π +З +И +Й +П +Я +г +— +‖ +‘ +’ +“ +” +• +… +‧ +′ +″ +※ +℃ +№ +™ +Ⅱ +Ⅲ +Ⅳ +← +↑ +→ +↓ +⇋ +∈ +∑ +√ +∞ +∣ +∧ +∩ +∫ +∶ +≈ +≠ +≤ +≥ +⊙ +⊥ +① +② +③ +④ +⑧ +⑴ +⑵ +⑶ +─ +│ +┅ +┌ +├ +█ +▎ +▏ +▕ +■ +□ +▪ +▲ +△ +▼ +◆ +◇ +○ +◎ +● +◥ +★ +☆ +❋ +❤ +  +、 +。 +〇 +〉 +《 +》 +「 +」 +『 +』 +【 +】 +〔 +〕 +〖 +〗 +の +サ +シ +ジ +マ +ㄱ +ㆍ +㎏ +㎡ +㐂 +㐱 +㙟 +㴪 +㸃 +䖝 +䝉 +䰾 +䲁 +一 +丁 +七 +丄 +丈 +三 +上 +下 +丌 +不 +与 +丏 +丐 +丑 +且 +丕 +世 +丘 +丙 +丞 +丟 +両 +並 +丨 +丫 +中 +丰 +串 +丶 +丸 +丹 +主 +丼 +丿 +乂 +乃 +久 +么 +之 +乍 +乎 +乏 +乒 +乓 +乖 +乗 +乘 +乙 +乚 +乜 +九 +乞 +也 +乩 +乭 +乳 +乸 +乹 +乾 +亀 +亂 +亅 +了 +予 +亊 +事 +二 +亍 +云 +互 +亓 +五 +井 +亘 +些 +亜 +亞 +亟 +亠 +亡 +亢 +交 +亥 +亦 +亨 +享 +京 +亭 +亮 +亰 +亳 +亶 +亹 +人 +亻 +什 +仁 +仂 +仃 +仄 +仇 +仉 +今 +介 +仍 +仏 +仔 +仕 +他 +仗 +付 +仙 +仛 +仝 +仞 +仟 +仡 +代 +令 +以 +仨 +仫 +仮 +仰 +仲 +仳 +仵 +件 +仺 +任 +仼 +份 +仿 +企 +伃 +伈 +伉 +伊 +伋 +伍 +伎 +伏 +伐 +休 +伕 +伙 +伝 +伢 +伯 +估 +伱 +伴 +伶 +伷 +伸 +伺 +似 +伽 +伾 +佀 +佁 +佃 +但 +佇 +佈 +佉 +佋 +位 +低 +住 +佐 +佑 +体 +佔 +何 +佗 +佘 +余 +佚 +佛 +作 +佝 +佞 +佟 +你 +佣 +佤 +佧 +佩 +佬 +佯 +佰 +佳 +併 +佶 +佹 +佺 +佼 +佾 +使 +侁 +侃 +侄 +侅 +來 +侈 +侊 +例 +侍 +侏 +侑 +侖 +侗 +侘 +侚 +供 +依 +侞 +価 +侮 +侯 +侵 +侶 +侷 +侹 +便 +俁 +係 +促 +俄 +俅 +俊 +俋 +俌 +俍 +俎 +俏 +俐 +俑 +俗 +俘 +俚 +俛 +保 +俞 +俟 +俠 +信 +俬 +修 +俯 +俱 +俳 +俴 +俵 +俶 +俸 +俺 +俽 +俾 +倆 +倈 +倉 +個 +倌 +倍 +們 +倒 +倓 +倔 +倖 +倗 +倘 +候 +倚 +倜 +倞 +借 +倡 +倢 +倣 +値 +倦 +倧 +倩 +倪 +倫 +倬 +倭 +倮 +倻 +值 +偁 +偃 +假 +偈 +偉 +偊 +偌 +偍 +偎 +偏 +偓 +偕 +做 +停 +健 +偪 +偲 +側 +偵 +偶 +偷 +偸 +偽 +傀 +傃 +傅 +傈 +傉 +傍 +傑 +傒 +傕 +傖 +傘 +備 +傜 +傢 +傣 +催 +傭 +傲 +傳 +債 +傷 +傻 +傾 +僅 +僉 +僊 +働 +像 +僑 +僔 +僕 +僖 +僙 +僚 +僜 +僡 +僧 +僩 +僭 +僮 +僰 +僱 +僳 +僴 +僵 +價 +僻 +儀 +儁 +儂 +億 +儆 +儇 +儈 +儉 +儋 +儐 +儒 +儔 +儕 +儘 +儚 +儞 +償 +儡 +儥 +儦 +優 +儫 +儱 +儲 +儷 +儺 +儻 +儼 +兀 +允 +元 +兄 +充 +兆 +先 +光 +克 +兌 +免 +児 +兒 +兔 +兕 +兗 +兜 +入 +內 +全 +兩 +兪 +八 +公 +六 +兮 +共 +兵 +其 +具 +典 +兼 +兿 +冀 +冂 +円 +冇 +冉 +冊 +再 +冏 +冑 +冒 +冕 +冖 +冗 +冚 +冠 +冢 +冤 +冥 +冧 +冨 +冪 +冫 
+冬 +冮 +冰 +冴 +冶 +冷 +冼 +冽 +凃 +凄 +准 +凈 +凋 +凌 +凍 +凖 +凜 +凝 +凞 +几 +凡 +処 +凪 +凬 +凰 +凱 +凳 +凵 +凶 +凸 +凹 +出 +函 +刀 +刁 +刂 +刃 +刄 +分 +切 +刈 +刊 +刎 +刑 +划 +列 +初 +判 +別 +刦 +刧 +刨 +利 +刪 +刮 +到 +制 +刷 +券 +刺 +刻 +刼 +剁 +剃 +則 +削 +剋 +剌 +前 +剎 +剏 +剔 +剖 +剛 +剝 +剡 +剣 +剩 +剪 +剮 +副 +割 +創 +剿 +劃 +劄 +劇 +劈 +劉 +劊 +劌 +劍 +劑 +劔 +力 +功 +加 +劣 +助 +努 +劫 +劬 +劭 +劵 +効 +劼 +劾 +勁 +勃 +勅 +勇 +勉 +勐 +勑 +勒 +勔 +動 +勖 +勗 +勘 +務 +勛 +勝 +勞 +募 +勢 +勣 +勤 +勦 +勰 +勱 +勲 +勳 +勵 +勷 +勸 +勺 +勻 +勾 +勿 +匂 +匄 +包 +匆 +匈 +匋 +匍 +匏 +匐 +匕 +化 +北 +匙 +匚 +匝 +匠 +匡 +匣 +匪 +匯 +匱 +匸 +匹 +匾 +匿 +區 +十 +千 +卅 +升 +午 +卉 +半 +卋 +卍 +卐 +卑 +卒 +卓 +協 +南 +博 +卜 +卞 +卟 +占 +卡 +卣 +卦 +卧 +卩 +卬 +卮 +卯 +印 +危 +卲 +即 +卵 +卷 +卸 +卹 +卺 +卻 +卽 +卿 +厄 +厓 +厔 +厙 +厚 +厝 +原 +厥 +厭 +厰 +厲 +厴 +厶 +去 +參 +叄 +又 +叉 +及 +友 +反 +収 +叔 +叕 +取 +受 +叛 +叟 +叡 +叢 +口 +古 +句 +另 +叨 +叩 +只 +叫 +召 +叭 +叮 +可 +台 +叱 +史 +右 +叵 +司 +叻 +叼 +吁 +吃 +各 +吆 +合 +吉 +吊 +吋 +同 +名 +后 +吏 +吐 +向 +吒 +吔 +吖 +君 +吝 +吞 +吟 +吠 +吡 +吥 +否 +吧 +吩 +含 +吮 +吱 +吲 +吳 +吵 +吶 +吸 +吹 +吻 +吼 +吾 +呀 +呂 +呃 +呈 +呉 +告 +呋 +呎 +呢 +呤 +呦 +周 +呱 +味 +呵 +呷 +呸 +呼 +命 +呾 +咀 +咁 +咂 +咄 +咅 +咆 +咋 +和 +咎 +咑 +咒 +咔 +咕 +咖 +咗 +咘 +咚 +咟 +咤 +咥 +咧 +咨 +咩 +咪 +咫 +咬 +咭 +咯 +咱 +咲 +咳 +咸 +咻 +咼 +咽 +咾 +咿 +哀 +品 +哂 +哄 +哆 +哇 +哈 +哉 +哌 +哎 +哏 +哐 +哖 +哚 +哞 +員 +哥 +哦 +哨 +哩 +哪 +哭 +哮 +哱 +哲 +哺 +哼 +唃 +唄 +唆 +唇 +唉 +唏 +唐 +唑 +唔 +唘 +唧 +唫 +唬 +唭 +售 +唯 +唱 +唳 +唵 +唷 +唸 +唻 +唾 +啁 +啃 +啄 +商 +啉 +啊 +啍 +問 +啓 +啖 +啚 +啜 +啞 +啟 +啡 +啣 +啤 +啥 +啦 +啪 +啫 +啯 +啰 +啱 +啲 +啵 +啶 +啷 +啻 +啼 +啾 +喀 +喂 +喃 +善 +喆 +喇 +喈 +喉 +喊 +喋 +喏 +喔 +喘 +喙 +喚 +喜 +喝 +喢 +喦 +喧 +喪 +喫 +喬 +單 +喰 +喱 +喲 +喳 +喵 +喹 +喻 +喼 +嗄 +嗅 +嗆 +嗇 +嗊 +嗎 +嗑 +嗒 +嗓 +嗔 +嗖 +嗚 +嗜 +嗝 +嗞 +嗡 +嗢 +嗣 +嗦 +嗨 +嗩 +嗪 +嗮 +嗯 +嗲 +嗶 +嗹 +嗽 +嘀 +嘅 +嘆 +嘉 +嘌 +嘍 +嘎 +嘏 +嘔 +嘗 +嘚 +嘛 +嘜 +嘞 +嘟 +嘢 +嘣 +嘥 +嘧 +嘩 +嘬 +嘮 +嘯 +嘰 +嘲 +嘴 +嘶 +嘸 +嘹 +嘻 +嘿 +噁 +噌 +噍 +噏 +噓 +噗 +噝 +噠 +噢 +噤 +噥 +噦 +器 +噩 +噪 +噬 +噯 +噰 +噲 +噴 +噶 +噸 +噹 +噻 +嚇 +嚈 +嚎 +嚏 +嚐 +嚒 +嚓 +嚕 +嚗 +嚙 +嚞 +嚟 +嚤 +嚦 +嚧 +嚨 +嚩 +嚮 +嚳 +嚴 +嚶 +嚷 +嚼 +嚿 +囀 +囂 +囃 +囉 +囊 +囍 +囑 +囒 +囓 +囗 +囚 +四 +囝 +回 +因 +囡 +団 +囤 +囧 +囪 +囮 +囯 +困 +囲 +図 +囶 +囷 +囹 +固 +囿 +圂 +圃 +圄 +圈 +圉 +國 +圍 +圏 +園 +圓 +圖 +圗 +團 +圜 +土 +圧 +在 +圩 +圪 +圭 +圯 +地 +圳 +圻 +圾 +址 +均 +坊 +坋 +坌 +坍 +坎 +坐 +坑 +坖 +坡 +坣 +坤 +坦 +坨 +坩 +坪 +坫 +坬 +坭 +坮 +坯 +坳 +坵 +坶 +坷 +坻 +垂 +垃 +垈 +型 +垍 +垓 +垕 +垚 +垛 +垞 +垟 +垠 
+垢 +垣 +垮 +垯 +垰 +垵 +垸 +垻 +垿 +埃 +埅 +埇 +埈 +埋 +埌 +城 +埏 +埒 +埔 +埕 +埗 +埜 +域 +埠 +埡 +埤 +埧 +埨 +埪 +埭 +埮 +埴 +埵 +執 +培 +基 +埻 +埼 +堀 +堂 +堃 +堅 +堆 +堇 +堈 +堉 +堊 +堍 +堖 +堝 +堡 +堤 +堦 +堪 +堮 +堯 +堰 +報 +場 +堵 +堷 +堺 +塀 +塅 +塆 +塊 +塋 +塌 +塍 +塏 +塑 +塔 +塗 +塘 +塙 +塜 +塞 +塡 +塢 +塤 +塨 +塩 +填 +塬 +塭 +塰 +塱 +塲 +塵 +塹 +塽 +塾 +墀 +境 +墅 +墉 +墊 +墎 +墓 +増 +墘 +墜 +增 +墟 +墡 +墣 +墨 +墩 +墫 +墬 +墮 +墱 +墳 +墺 +墼 +墾 +壁 +壄 +壆 +壇 +壋 +壌 +壎 +壐 +壑 +壓 +壔 +壕 +壘 +壙 +壞 +壟 +壠 +壢 +壤 +壩 +士 +壬 +壯 +壱 +壴 +壹 +壺 +壽 +夀 +夆 +変 +夊 +夋 +夌 +夏 +夔 +夕 +外 +夙 +多 +夜 +夠 +夢 +夤 +夥 +大 +天 +太 +夫 +夬 +夭 +央 +夯 +失 +夷 +夾 +奀 +奄 +奇 +奈 +奉 +奎 +奏 +奐 +契 +奓 +奔 +奕 +套 +奘 +奚 +奠 +奢 +奣 +奧 +奩 +奪 +奫 +奭 +奮 +女 +奴 +奶 +她 +好 +妀 +妁 +如 +妃 +妄 +妊 +妍 +妏 +妑 +妒 +妓 +妖 +妙 +妝 +妞 +妠 +妤 +妥 +妧 +妨 +妭 +妮 +妯 +妲 +妳 +妸 +妹 +妺 +妻 +妾 +姀 +姁 +姃 +姆 +姈 +姉 +姊 +始 +姌 +姍 +姐 +姑 +姒 +姓 +委 +姚 +姜 +姝 +姣 +姥 +姦 +姨 +姪 +姫 +姬 +姮 +姵 +姶 +姸 +姻 +姿 +威 +娃 +娉 +娋 +娌 +娍 +娎 +娑 +娖 +娘 +娛 +娜 +娟 +娠 +娣 +娥 +娩 +娫 +娳 +娶 +娸 +娼 +娽 +婀 +婁 +婆 +婉 +婊 +婑 +婕 +婚 +婢 +婦 +婧 +婪 +婭 +婯 +婷 +婺 +婻 +婼 +婿 +媃 +媄 +媊 +媐 +媒 +媓 +媖 +媗 +媚 +媛 +媜 +媞 +媧 +媭 +媯 +媲 +媳 +媺 +媼 +媽 +媾 +媿 +嫁 +嫂 +嫄 +嫈 +嫉 +嫌 +嫖 +嫘 +嫚 +嫡 +嫣 +嫦 +嫩 +嫪 +嫲 +嫳 +嫵 +嫺 +嫻 +嬅 +嬈 +嬉 +嬋 +嬌 +嬗 +嬛 +嬝 +嬡 +嬤 +嬨 +嬪 +嬬 +嬭 +嬰 +嬴 +嬸 +嬾 +嬿 +孀 +孃 +孆 +孋 +孌 +子 +孑 +孔 +孕 +孖 +字 +存 +孚 +孛 +孜 +孝 +孟 +孢 +季 +孤 +孩 +孫 +孬 +孮 +孰 +孳 +孵 +學 +孺 +孻 +孽 +孿 +宀 +它 +宅 +宇 +守 +安 +宋 +完 +宍 +宏 +宓 +宕 +宗 +官 +宙 +定 +宛 +宜 +実 +客 +宣 +室 +宥 +宦 +宧 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宿 +寀 +寁 +寂 +寄 +寅 +密 +寇 +寈 +寊 +富 +寐 +寒 +寓 +寔 +寕 +寖 +寗 +寘 +寛 +寜 +寞 +察 +寡 +寢 +寤 +寥 +實 +寧 +寨 +審 +寫 +寬 +寮 +寯 +寰 +寳 +寵 +寶 +寸 +寺 +対 +封 +専 +尃 +射 +將 +專 +尉 +尊 +尋 +對 +導 +小 +尐 +少 +尓 +尕 +尖 +尗 +尙 +尚 +尢 +尤 +尨 +尪 +尬 +就 +尷 +尹 +尺 +尻 +尼 +尾 +尿 +局 +屁 +屄 +居 +屆 +屇 +屈 +屋 +屌 +屍 +屎 +屏 +屐 +屑 +屓 +展 +屚 +屜 +屠 +屢 +層 +履 +屬 +屭 +屯 +山 +屹 +屺 +屻 +岀 +岈 +岌 +岐 +岑 +岔 +岡 +岢 +岣 +岧 +岩 +岪 +岫 +岬 +岰 +岱 +岳 +岵 +岷 +岸 +岻 +峁 +峅 +峇 +峋 +峍 +峒 +峘 +峙 +峚 +峠 +峨 +峩 +峪 +峭 +峯 +峰 +峴 +島 +峻 +峼 +峽 +崁 +崆 +崇 +崈 +崋 +崍 +崎 +崐 +崑 +崒 +崔 +崖 +崗 +崘 +崙 +崚 +崛 +崞 +崟 +崠 +崢 +崤 +崧 +崩 +崬 +崮 +崱 +崴 +崵 +崶 +崽 +嵇 +嵊 +嵋 +嵌 +嵎 +嵐 +嵒 +嵕 +嵖 +嵗 +嵙 +嵛 +嵜 +嵨 +嵩 +嵬 +嵮 +嵯 +嵰 +嵴 +嵻 +嵿 +嶁 +嶂 +嶃 +嶄 +嶇 +嶋 +嶌 +嶍 +嶒 +嶔 +嶗 +嶝 +嶠 +嶢 +嶦 +嶧 +嶪 +嶬 +嶰 +嶲 +嶴 +嶷 +嶸 +嶺 +嶼 +嶽 +巂 +巄 +巆 +巋 +巌 +巍 +巎 +巑 +巒 +巔 +巖 +巘 
+巛 +川 +州 +巡 +巢 +工 +左 +巧 +巨 +巫 +差 +巰 +己 +已 +巳 +巴 +巶 +巷 +巻 +巽 +巾 +巿 +市 +布 +帆 +希 +帑 +帔 +帕 +帖 +帘 +帙 +帚 +帛 +帝 +帡 +帢 +帥 +師 +席 +帯 +帰 +帳 +帶 +帷 +常 +帽 +幀 +幃 +幄 +幅 +幌 +幔 +幕 +幗 +幚 +幛 +幟 +幡 +幢 +幣 +幪 +幫 +干 +平 +年 +幵 +幷 +幸 +幹 +幺 +幻 +幼 +幽 +幾 +庀 +庁 +広 +庇 +床 +序 +底 +庖 +店 +庚 +府 +庠 +庢 +庥 +度 +座 +庫 +庭 +庲 +庵 +庶 +康 +庸 +庹 +庼 +庾 +廁 +廂 +廄 +廆 +廈 +廉 +廊 +廋 +廌 +廍 +廑 +廓 +廔 +廕 +廖 +廙 +廚 +廝 +廞 +廟 +廠 +廡 +廢 +廣 +廧 +廨 +廩 +廬 +廰 +廱 +廳 +延 +廷 +廸 +建 +廻 +廼 +廿 +弁 +弄 +弅 +弇 +弈 +弉 +弊 +弋 +弍 +式 +弐 +弒 +弓 +弔 +引 +弖 +弗 +弘 +弛 +弟 +弢 +弦 +弧 +弨 +弩 +弭 +弱 +張 +強 +弸 +弼 +弾 +彀 +彄 +彅 +彆 +彈 +彊 +彌 +彎 +彐 +彔 +彖 +彗 +彘 +彙 +彜 +彞 +彠 +彡 +形 +彣 +彤 +彥 +彧 +彩 +彪 +彫 +彬 +彭 +彰 +影 +彳 +彷 +役 +彼 +彿 +往 +征 +徂 +待 +徇 +很 +徉 +徊 +律 +後 +徐 +徑 +徒 +得 +徘 +徙 +徜 +從 +徠 +御 +徧 +徨 +復 +循 +徫 +徬 +徭 +微 +徳 +徴 +徵 +德 +徸 +徹 +徽 +心 +忄 +必 +忉 +忌 +忍 +忐 +忑 +忒 +志 +忘 +忙 +応 +忝 +忞 +忠 +快 +忬 +忯 +忱 +忳 +念 +忻 +忽 +忿 +怍 +怎 +怒 +怕 +怖 +怙 +怛 +思 +怠 +怡 +急 +怦 +性 +怨 +怪 +怯 +怵 +恁 +恂 +恃 +恆 +恊 +恍 +恐 +恕 +恙 +恢 +恣 +恤 +恥 +恨 +恩 +恪 +恬 +恭 +息 +恰 +恵 +恿 +悄 +悅 +悆 +悉 +悌 +悍 +悔 +悖 +悚 +悛 +悝 +悞 +悟 +悠 +患 +悧 +您 +悪 +悰 +悲 +悳 +悵 +悶 +悸 +悼 +情 +惆 +惇 +惑 +惔 +惕 +惘 +惚 +惜 +惟 +惠 +惡 +惣 +惦 +惰 +惱 +惲 +想 +惶 +惹 +惺 +愁 +愃 +愆 +愈 +愉 +愍 +意 +愐 +愒 +愔 +愕 +愚 +愛 +愜 +感 +愣 +愧 +愨 +愫 +愭 +愴 +愷 +愼 +愾 +愿 +慄 +慈 +態 +慌 +慎 +慕 +慘 +慚 +慜 +慟 +慢 +慣 +慥 +慧 +慨 +慮 +慰 +慳 +慵 +慶 +慷 +慾 +憂 +憊 +憋 +憍 +憎 +憐 +憑 +憓 +憕 +憙 +憚 +憤 +憧 +憨 +憩 +憫 +憬 +憲 +憶 +憺 +憻 +憾 +懂 +懃 +懇 +懈 +應 +懋 +懌 +懍 +懐 +懣 +懦 +懮 +懲 +懵 +懶 +懷 +懸 +懺 +懼 +懽 +懾 +懿 +戀 +戇 +戈 +戊 +戌 +戍 +戎 +成 +我 +戒 +戔 +戕 +或 +戙 +戚 +戛 +戟 +戡 +戢 +戥 +戦 +戩 +截 +戮 +戰 +戱 +戲 +戳 +戴 +戶 +戸 +戻 +戽 +戾 +房 +所 +扁 +扆 +扇 +扈 +扉 +手 +扌 +才 +扎 +扒 +打 +扔 +托 +扙 +扛 +扞 +扣 +扥 +扦 +扭 +扮 +扯 +扳 +扶 +批 +扼 +找 +承 +技 +抃 +抄 +抇 +抉 +把 +抑 +抒 +抓 +投 +抖 +抗 +折 +抦 +披 +抬 +抱 +抵 +抹 +抻 +押 +抽 +抿 +拂 +拆 +拇 +拈 +拉 +拋 +拌 +拍 +拎 +拏 +拐 +拒 +拓 +拔 +拖 +拗 +拘 +拙 +拚 +招 +拜 +拝 +拡 +括 +拭 +拮 +拯 +拱 +拳 +拴 +拷 +拺 +拼 +拽 +拾 +拿 +持 +指 +按 +挎 +挑 +挖 +挙 +挨 +挪 +挫 +振 +挲 +挵 +挹 +挺 +挻 +挾 +捂 +捆 +捉 +捌 +捍 +捎 +捏 +捐 +捒 +捕 +捜 +捦 +捧 +捨 +捩 +捫 +捭 +捱 +捲 +捶 +捷 +捺 +捻 +掀 +掂 +掃 +掄 +掇 +授 +掉 +掌 +掏 +掐 +排 +掖 +掘 +掙 +掛 +掞 +掟 +掠 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掰 +掾 +揀 +揄 +揆 +揉 +揍 +描 +提 +插 +揔 +揖 +揚 +換 +握 +揪 +揭 +揮 +援 +揸 +揺 +損 +搏 +搐 +搓 +搔 +搖 +搗 +搜 +搞 +搠 +搢 +搪 +搬 
+搭 +搳 +搴 +搵 +搶 +搽 +搾 +摂 +摒 +摔 +摘 +摜 +摞 +摟 +摠 +摧 +摩 +摭 +摯 +摳 +摴 +摵 +摶 +摸 +摹 +摺 +摻 +摽 +撃 +撇 +撈 +撐 +撒 +撓 +撕 +撖 +撙 +撚 +撞 +撣 +撤 +撥 +撩 +撫 +撬 +播 +撮 +撰 +撲 +撳 +撻 +撼 +撾 +撿 +擀 +擁 +擂 +擅 +擇 +擊 +擋 +操 +擎 +擒 +擔 +擘 +據 +擠 +擢 +擥 +擦 +擬 +擯 +擰 +擱 +擲 +擴 +擷 +擺 +擼 +擾 +攀 +攏 +攔 +攖 +攘 +攜 +攝 +攞 +攢 +攣 +攤 +攪 +攫 +攬 +支 +攴 +攵 +收 +攷 +攸 +改 +攻 +攽 +放 +政 +故 +效 +敍 +敎 +敏 +救 +敔 +敕 +敖 +敗 +敘 +教 +敝 +敞 +敟 +敢 +散 +敦 +敫 +敬 +敭 +敲 +整 +敵 +敷 +數 +敻 +敾 +斂 +斃 +文 +斌 +斎 +斐 +斑 +斕 +斖 +斗 +料 +斛 +斜 +斝 +斟 +斡 +斤 +斥 +斧 +斬 +斯 +新 +斷 +方 +於 +施 +斿 +旁 +旂 +旃 +旄 +旅 +旉 +旋 +旌 +旎 +族 +旖 +旗 +旙 +旛 +旡 +既 +日 +旦 +旨 +早 +旬 +旭 +旱 +旲 +旳 +旺 +旻 +旼 +旽 +旾 +旿 +昀 +昂 +昃 +昆 +昇 +昉 +昊 +昌 +昍 +明 +昏 +昐 +易 +昔 +昕 +昚 +昛 +昜 +昝 +昞 +星 +映 +昡 +昣 +昤 +春 +昧 +昨 +昪 +昫 +昭 +是 +昰 +昱 +昴 +昵 +昶 +昺 +晁 +時 +晃 +晈 +晉 +晊 +晏 +晗 +晙 +晚 +晛 +晝 +晞 +晟 +晤 +晦 +晧 +晨 +晩 +晪 +晫 +晭 +普 +景 +晰 +晳 +晴 +晶 +晷 +晸 +智 +晾 +暃 +暄 +暅 +暇 +暈 +暉 +暊 +暌 +暎 +暏 +暐 +暑 +暕 +暖 +暗 +暘 +暝 +暟 +暠 +暢 +暦 +暨 +暫 +暮 +暱 +暲 +暴 +暸 +暹 +暻 +暾 +曄 +曅 +曆 +曇 +曉 +曌 +曔 +曖 +曙 +曜 +曝 +曠 +曦 +曧 +曨 +曩 +曬 +曮 +曰 +曲 +曳 +更 +曶 +曷 +書 +曹 +曺 +曼 +曽 +曾 +替 +最 +會 +月 +有 +朊 +朋 +服 +朏 +朐 +朓 +朔 +朕 +朖 +朗 +望 +朝 +期 +朦 +朧 +木 +未 +末 +本 +札 +朱 +朴 +朵 +朶 +朽 +朿 +杁 +杉 +杋 +杌 +李 +杏 +材 +村 +杓 +杖 +杙 +杜 +杞 +束 +杠 +杣 +杤 +杧 +杬 +杭 +杯 +東 +杲 +杳 +杴 +杵 +杷 +杻 +杼 +松 +板 +极 +枇 +枉 +枋 +枏 +析 +枕 +枖 +林 +枚 +枛 +果 +枝 +枠 +枡 +枯 +枰 +枱 +枲 +枳 +架 +枷 +枸 +枹 +枼 +柁 +柃 +柄 +柉 +柊 +柎 +柏 +某 +柑 +柒 +染 +柔 +柘 +柚 +柜 +柝 +柞 +柟 +查 +柩 +柬 +柯 +柰 +柱 +柳 +柴 +柵 +柶 +柷 +査 +柾 +柿 +栃 +栄 +栐 +栒 +栓 +栜 +栝 +栞 +校 +栢 +栨 +栩 +株 +栲 +栴 +核 +根 +栻 +格 +栽 +桀 +桁 +桂 +桃 +桄 +桅 +框 +案 +桉 +桌 +桎 +桐 +桑 +桓 +桔 +桕 +桖 +桙 +桜 +桝 +桫 +桱 +桲 +桴 +桶 +桷 +桼 +桿 +梀 +梁 +梂 +梃 +梅 +梆 +梉 +梏 +梓 +梔 +梗 +梘 +條 +梟 +梠 +梢 +梣 +梧 +梨 +梫 +梭 +梯 +械 +梱 +梳 +梵 +梶 +梽 +棄 +棆 +棉 +棋 +棍 +棐 +棒 +棓 +棕 +棖 +棗 +棘 +棚 +棛 +棟 +棠 +棡 +棣 +棧 +棨 +棩 +棪 +棫 +森 +棱 +棲 +棵 +棶 +棹 +棺 +棻 +棼 +棽 +椅 +椆 +椇 +椋 +植 +椎 +椏 +椒 +椙 +椥 +椪 +椰 +椲 +椴 +椵 +椹 +椽 +椿 +楂 +楊 +楓 +楔 +楗 +楙 +楚 +楝 +楞 +楠 +楡 +楢 +楣 +楤 +楦 +楧 +楨 +楫 +業 +楮 +楯 +楳 +極 +楷 +楸 +楹 +楽 +楿 +概 +榆 +榊 +榍 +榎 +榑 +榔 +榕 +榖 +榗 +榘 +榛 +榜 +榞 +榢 +榣 +榤 +榦 +榧 +榨 +榫 +榭 +榮 +榲 +榴 +榷 +榻 +榿 +槀 +槁 +槃 +槊 +構 +槌 +槍 +槎 +槐 +槓 +槔 +槗 +様 +槙 +槤 +槩 +槭 +槰 +槱 +槲 +槳 +槺 +槻 +槼 +槽 +槿 +樀 +樁 +樂 +樅 +樆 +樊 +樋 +樑 +樓 +樗 +樘 +標 +樞 +樟 +模 
+樣 +樨 +権 +樫 +樵 +樸 +樹 +樺 +樻 +樽 +樾 +橄 +橇 +橈 +橋 +橐 +橒 +橓 +橘 +橙 +橚 +機 +橡 +橢 +橪 +橫 +橿 +檀 +檄 +檇 +檉 +檊 +檎 +檐 +檔 +檗 +檜 +檞 +檠 +檡 +檢 +檣 +檦 +檨 +檫 +檬 +檯 +檳 +檵 +檸 +檻 +檽 +櫂 +櫃 +櫆 +櫈 +櫓 +櫚 +櫛 +櫞 +櫟 +櫥 +櫨 +櫪 +櫱 +櫸 +櫻 +櫾 +櫿 +欄 +欉 +權 +欏 +欒 +欖 +欞 +欠 +次 +欣 +欥 +欲 +欸 +欹 +欺 +欽 +款 +歆 +歇 +歉 +歊 +歌 +歎 +歐 +歓 +歙 +歛 +歡 +止 +正 +此 +步 +武 +歧 +歩 +歪 +歲 +歳 +歴 +歷 +歸 +歹 +死 +歿 +殂 +殃 +殄 +殆 +殉 +殊 +殑 +殖 +殘 +殛 +殞 +殟 +殤 +殭 +殮 +殯 +殲 +殳 +段 +殷 +殺 +殻 +殼 +殿 +毀 +毅 +毆 +毉 +毋 +毌 +母 +毎 +每 +毐 +毒 +毓 +比 +毖 +毗 +毘 +毛 +毫 +毬 +毯 +毴 +毸 +毽 +毿 +氂 +氈 +氍 +氏 +氐 +民 +氓 +氖 +気 +氘 +氙 +氚 +氛 +氟 +氣 +氦 +氧 +氨 +氪 +氫 +氬 +氮 +氯 +氰 +水 +氵 +氷 +永 +氹 +氻 +氽 +氾 +汀 +汁 +求 +汊 +汎 +汐 +汕 +汗 +汛 +汜 +汝 +汞 +江 +池 +污 +汧 +汨 +汩 +汪 +汭 +汰 +汲 +汴 +汶 +決 +汽 +汾 +沁 +沂 +沃 +沄 +沅 +沆 +沇 +沈 +沉 +沌 +沍 +沏 +沐 +沒 +沓 +沔 +沖 +沘 +沙 +沚 +沛 +沜 +沢 +沨 +沫 +沭 +沮 +沯 +沱 +河 +沸 +油 +沺 +治 +沼 +沽 +沾 +沿 +況 +泂 +泄 +泆 +泇 +泉 +泊 +泌 +泐 +泓 +泔 +法 +泖 +泗 +泚 +泛 +泠 +泡 +波 +泣 +泥 +泩 +泫 +泮 +泯 +泰 +泱 +泳 +泵 +洄 +洋 +洌 +洎 +洗 +洙 +洛 +洞 +洢 +洣 +洤 +津 +洨 +洩 +洪 +洮 +洱 +洲 +洳 +洵 +洸 +洹 +洺 +活 +洽 +派 +流 +浄 +浙 +浚 +浛 +浜 +浞 +浟 +浠 +浡 +浣 +浤 +浥 +浦 +浩 +浪 +浮 +浯 +浴 +浵 +海 +浸 +浹 +涅 +涇 +消 +涉 +涌 +涎 +涑 +涓 +涔 +涕 +涙 +涪 +涫 +涮 +涯 +液 +涵 +涸 +涼 +涿 +淄 +淅 +淆 +淇 +淋 +淌 +淍 +淎 +淏 +淑 +淓 +淖 +淘 +淙 +淚 +淛 +淝 +淞 +淠 +淡 +淤 +淥 +淦 +淨 +淩 +淪 +淫 +淬 +淮 +淯 +淰 +深 +淳 +淵 +淶 +混 +淸 +淹 +淺 +添 +淼 +淽 +渃 +清 +済 +渉 +渋 +渕 +渙 +渚 +減 +渝 +渟 +渠 +渡 +渣 +渤 +渥 +渦 +渫 +測 +渭 +港 +渲 +渴 +游 +渺 +渼 +渽 +渾 +湃 +湄 +湉 +湊 +湍 +湓 +湔 +湖 +湘 +湛 +湜 +湞 +湟 +湣 +湥 +湧 +湫 +湮 +湯 +湳 +湴 +湼 +満 +溁 +溇 +溈 +溉 +溋 +溎 +溏 +源 +準 +溙 +溜 +溝 +溟 +溢 +溥 +溦 +溧 +溪 +溫 +溯 +溱 +溲 +溴 +溵 +溶 +溺 +溼 +滀 +滁 +滂 +滄 +滅 +滇 +滈 +滉 +滋 +滌 +滎 +滏 +滑 +滓 +滔 +滕 +滘 +滙 +滝 +滬 +滯 +滲 +滴 +滷 +滸 +滹 +滻 +滽 +滾 +滿 +漁 +漂 +漆 +漇 +漈 +漎 +漏 +漓 +演 +漕 +漚 +漠 +漢 +漣 +漩 +漪 +漫 +漬 +漯 +漱 +漲 +漳 +漴 +漵 +漷 +漸 +漼 +漾 +漿 +潁 +潑 +潔 +潘 +潛 +潞 +潟 +潢 +潤 +潭 +潮 +潯 +潰 +潲 +潺 +潼 +潽 +潾 +潿 +澀 +澁 +澂 +澄 +澆 +澇 +澈 +澉 +澋 +澌 +澍 +澎 +澔 +澗 +澠 +澡 +澣 +澤 +澥 +澧 +澪 +澮 +澯 +澱 +澳 +澶 +澹 +澻 +激 +濁 +濂 +濃 +濉 +濊 +濋 +濕 +濘 +濙 +濛 +濞 +濟 +濠 +濡 +濤 +濫 +濬 +濮 +濯 +濰 +濱 +濲 +濶 +濺 +濼 +濾 +瀁 +瀅 +瀆 +瀉 +瀍 +瀏 +瀑 +瀔 +瀕 +瀘 +瀚 +瀛 +瀝 +瀞 +瀟 +瀠 +瀣 +瀦 +瀧 +瀨 +瀬 +瀰 +瀲 +瀴 +瀶 +瀹 +瀾 +灃 +灊 +灌 +灑 +灘 +灝 +灞 +灡 +灣 +灤 +灧 +火 +灰 +灴 +灸 +灼 +災 +炁 +炅 +炆 +炊 +炎 +炒 +炔 +炕 +炘 +炙 +炟 +炣 +炤 
+炫 +炬 +炭 +炮 +炯 +炱 +炲 +炳 +炷 +炸 +為 +炻 +烈 +烉 +烊 +烋 +烏 +烒 +烔 +烘 +烙 +烜 +烝 +烤 +烯 +烱 +烴 +烷 +烹 +烺 +烽 +焃 +焄 +焉 +焊 +焌 +焓 +焗 +焙 +焚 +焜 +焞 +無 +焦 +焯 +焰 +焱 +焴 +然 +焻 +焼 +焿 +煇 +煉 +煊 +煌 +煎 +煐 +煒 +煔 +煕 +煖 +煙 +煚 +煜 +煞 +煠 +煤 +煥 +煦 +照 +煨 +煩 +煬 +煮 +煲 +煳 +煵 +煶 +煸 +煽 +熄 +熅 +熇 +熈 +熊 +熏 +熒 +熔 +熖 +熗 +熘 +熙 +熜 +熟 +熠 +熤 +熥 +熨 +熬 +熯 +熱 +熲 +熳 +熵 +熹 +熺 +熼 +熾 +熿 +燁 +燃 +燄 +燈 +燉 +燊 +燎 +燏 +燐 +燒 +燔 +燕 +燘 +燙 +燚 +燜 +燝 +營 +燥 +燦 +燧 +燫 +燬 +燭 +燮 +燴 +燹 +燻 +燼 +燾 +燿 +爀 +爆 +爌 +爍 +爐 +爔 +爚 +爛 +爝 +爨 +爪 +爬 +爭 +爯 +爰 +爲 +爵 +父 +爸 +爹 +爺 +爻 +爽 +爾 +爿 +牁 +牂 +牆 +片 +版 +牌 +牒 +牕 +牖 +牘 +牙 +牛 +牝 +牟 +牠 +牡 +牢 +牧 +物 +牯 +牲 +特 +牻 +牼 +牽 +犀 +犁 +犂 +犇 +犍 +犎 +犖 +犛 +犢 +犧 +犨 +犬 +犯 +犰 +犴 +犽 +狀 +狂 +狄 +狍 +狎 +狐 +狒 +狓 +狗 +狙 +狛 +狟 +狠 +狡 +狦 +狨 +狩 +狳 +狶 +狷 +狸 +狹 +狻 +狼 +猁 +猄 +猇 +猊 +猗 +猙 +猛 +猜 +猝 +猞 +猢 +猥 +猨 +猩 +猳 +猴 +猶 +猷 +猺 +猻 +猾 +猿 +獁 +獃 +獄 +獅 +獇 +獎 +獏 +獐 +獒 +獠 +獢 +獣 +獨 +獬 +獮 +獯 +獰 +獲 +獴 +獵 +獷 +獸 +獺 +獻 +獼 +獾 +玀 +玄 +玆 +率 +玉 +王 +玎 +玏 +玓 +玕 +玖 +玗 +玘 +玙 +玟 +玠 +玡 +玢 +玥 +玧 +玨 +玩 +玫 +玭 +玲 +玳 +玶 +玷 +玹 +玻 +玾 +珀 +珂 +珅 +珈 +珉 +珊 +珌 +珍 +珎 +珏 +珖 +珙 +珝 +珞 +珠 +珡 +珣 +珤 +珥 +珦 +珧 +珩 +珪 +班 +珮 +珵 +珹 +珺 +珽 +現 +琁 +球 +琄 +琅 +理 +琇 +琉 +琊 +琍 +琎 +琚 +琛 +琡 +琢 +琤 +琥 +琦 +琨 +琪 +琬 +琮 +琯 +琰 +琱 +琳 +琴 +琵 +琶 +琹 +琺 +琿 +瑀 +瑁 +瑂 +瑄 +瑅 +瑆 +瑈 +瑊 +瑋 +瑑 +瑒 +瑕 +瑗 +瑙 +瑚 +瑛 +瑜 +瑝 +瑞 +瑟 +瑠 +瑢 +瑣 +瑤 +瑥 +瑧 +瑨 +瑩 +瑪 +瑭 +瑯 +瑰 +瑱 +瑳 +瑴 +瑺 +瑾 +璀 +璁 +璃 +璄 +璆 +璇 +璈 +璉 +璋 +璌 +璐 +璕 +璘 +璙 +璚 +璜 +璞 +璟 +璠 +璡 +璣 +璥 +璦 +璧 +璨 +璩 +璪 +璫 +璬 +璮 +環 +璱 +璵 +璸 +璹 +璽 +璿 +瓈 +瓊 +瓌 +瓏 +瓑 +瓔 +瓖 +瓘 +瓚 +瓛 +瓜 +瓞 +瓠 +瓢 +瓣 +瓤 +瓦 +瓮 +瓴 +瓶 +瓷 +瓿 +甂 +甄 +甌 +甍 +甑 +甕 +甘 +甙 +甚 +甜 +生 +甡 +產 +産 +甥 +甦 +用 +甩 +甪 +甫 +甬 +甯 +田 +由 +甲 +申 +男 +甸 +甹 +町 +甾 +畀 +畇 +畈 +畊 +畋 +界 +畎 +畏 +畐 +畑 +畔 +留 +畜 +畝 +畠 +畢 +略 +畦 +畧 +番 +畫 +畬 +畯 +異 +畲 +畳 +畵 +當 +畷 +畸 +畹 +畿 +疃 +疆 +疇 +疊 +疋 +疌 +疍 +疏 +疑 +疒 +疕 +疙 +疚 +疝 +疣 +疤 +疥 +疫 +疲 +疳 +疵 +疸 +疹 +疼 +疽 +疾 +痂 +病 +症 +痊 +痍 +痔 +痕 +痘 +痙 +痛 +痞 +痟 +痠 +痢 +痣 +痤 +痧 +痩 +痰 +痱 +痲 +痴 +痹 +痺 +痿 +瘀 +瘁 +瘊 +瘋 +瘍 +瘓 +瘙 +瘜 +瘞 +瘟 +瘠 +瘡 +瘢 +瘤 +瘦 +瘧 +瘩 +瘰 +瘴 +瘺 +癀 +療 +癆 +癇 +癌 +癒 +癖 +癘 +癜 +癟 +癡 +癢 +癤 +癥 +癩 +癬 +癭 +癮 +癯 +癰 +癱 +癲 +癸 +発 +登 +發 +白 +百 +皂 +的 +皆 +皇 +皈 +皋 +皎 +皐 +皓 +皖 +皙 +皚 +皛 +皝 +皞 +皮 +皰 +皴 +皷 +皸 +皺 +皿 +盂 +盃 +盅 +盆 +盈 +益 +盋 +盌 +盎 +盒 +盔 +盛 +盜 +盞 +盟 
+盡 +監 +盤 +盥 +盦 +盧 +盨 +盩 +盪 +盫 +目 +盯 +盱 +盲 +直 +盷 +相 +盹 +盺 +盼 +盾 +眀 +省 +眉 +看 +県 +眙 +眛 +眜 +眞 +真 +眠 +眥 +眨 +眩 +眭 +眯 +眵 +眶 +眷 +眸 +眺 +眼 +眾 +着 +睇 +睛 +睜 +睞 +睡 +睢 +督 +睥 +睦 +睨 +睪 +睫 +睭 +睹 +睺 +睽 +睾 +睿 +瞄 +瞅 +瞋 +瞌 +瞎 +瞑 +瞓 +瞞 +瞢 +瞥 +瞧 +瞪 +瞫 +瞬 +瞭 +瞰 +瞳 +瞻 +瞼 +瞽 +瞿 +矇 +矍 +矗 +矚 +矛 +矜 +矞 +矢 +矣 +知 +矧 +矩 +短 +矮 +矯 +石 +矸 +矽 +砂 +砋 +砌 +砍 +砒 +研 +砝 +砢 +砥 +砦 +砧 +砩 +砫 +砭 +砮 +砯 +砰 +砲 +砳 +破 +砵 +砷 +砸 +砼 +硂 +硃 +硅 +硇 +硏 +硐 +硒 +硓 +硚 +硜 +硝 +硤 +硨 +硫 +硬 +硭 +硯 +硼 +碁 +碇 +碉 +碌 +碎 +碑 +碓 +碕 +碗 +碘 +碚 +碟 +碡 +碣 +碧 +碩 +碪 +碭 +碰 +碲 +碳 +碴 +碶 +碸 +確 +碻 +碼 +碽 +碾 +磁 +磅 +磊 +磋 +磐 +磔 +磕 +磘 +磙 +磚 +磜 +磡 +磨 +磪 +磬 +磯 +磱 +磲 +磵 +磷 +磺 +磻 +磾 +礁 +礄 +礎 +礐 +礑 +礒 +礙 +礠 +礦 +礪 +礫 +礬 +礮 +礱 +礴 +示 +礻 +礽 +社 +祀 +祁 +祂 +祆 +祇 +祈 +祉 +祋 +祏 +祐 +祓 +祕 +祖 +祗 +祙 +祚 +祛 +祜 +祝 +神 +祟 +祠 +祥 +祧 +票 +祭 +祹 +祺 +祼 +祿 +禁 +禃 +禇 +禍 +禎 +福 +禑 +禓 +禔 +禕 +禘 +禛 +禟 +禠 +禤 +禦 +禧 +禨 +禩 +禪 +禮 +禰 +禱 +禵 +禹 +禺 +禼 +禽 +禾 +禿 +秀 +私 +秈 +秉 +秋 +科 +秒 +秕 +秘 +租 +秠 +秣 +秤 +秦 +秧 +秩 +秭 +秳 +秸 +移 +稀 +稅 +稈 +稉 +程 +稍 +稑 +稔 +稗 +稘 +稙 +稚 +稜 +稞 +稟 +稠 +種 +稱 +稲 +稷 +稹 +稺 +稻 +稼 +稽 +稾 +稿 +穀 +穂 +穆 +穈 +穉 +穌 +積 +穎 +穗 +穟 +穠 +穡 +穢 +穣 +穩 +穫 +穰 +穴 +穵 +究 +穹 +空 +穿 +突 +窄 +窅 +窈 +窋 +窒 +窕 +窖 +窗 +窘 +窟 +窠 +窣 +窨 +窩 +窪 +窮 +窯 +窰 +窶 +窺 +窿 +竄 +竅 +竇 +竈 +竊 +立 +竑 +站 +竜 +竟 +章 +竣 +童 +竦 +竩 +竭 +端 +競 +竹 +竺 +竻 +竿 +笄 +笆 +笈 +笏 +笑 +笘 +笙 +笛 +笞 +笠 +笥 +符 +笨 +笩 +笪 +第 +笭 +笮 +笯 +笱 +笳 +笹 +筅 +筆 +等 +筊 +筋 +筌 +筍 +筏 +筐 +筒 +答 +策 +筘 +筠 +筥 +筦 +筧 +筬 +筭 +筱 +筲 +筳 +筵 +筶 +筷 +筻 +箆 +箇 +箋 +箍 +箏 +箐 +箑 +箒 +箔 +箕 +算 +箜 +管 +箬 +箭 +箱 +箴 +箸 +節 +篁 +範 +篆 +篇 +築 +篊 +篋 +篌 +篔 +篙 +篝 +篠 +篡 +篤 +篥 +篦 +篩 +篪 +篭 +篯 +篳 +篷 +簀 +簃 +簇 +簉 +簋 +簍 +簑 +簕 +簗 +簞 +簠 +簡 +簧 +簪 +簫 +簷 +簸 +簹 +簺 +簽 +簾 +簿 +籀 +籃 +籌 +籍 +籐 +籙 +籛 +籜 +籝 +籟 +籠 +籣 +籤 +籥 +籪 +籬 +籮 +籲 +米 +籽 +籾 +粄 +粉 +粍 +粑 +粒 +粕 +粗 +粘 +粟 +粢 +粥 +粦 +粧 +粩 +粱 +粲 +粳 +粵 +粹 +粼 +粽 +精 +粿 +糀 +糅 +糊 +糌 +糍 +糎 +糕 +糖 +糙 +糜 +糝 +糞 +糟 +糠 +糢 +糧 +糬 +糯 +糰 +糴 +糶 +糸 +糹 +糺 +系 +糾 +紀 +紂 +約 +紅 +紆 +紇 +紈 +紉 +紊 +紋 +納 +紐 +紑 +紓 +純 +紕 +紗 +紘 +紙 +級 +紛 +紜 +紝 +紞 +素 +紡 +索 +紫 +紮 +累 +細 +紱 +紲 +紳 +紵 +紹 +紺 +紿 +終 +絃 +組 +絆 +経 +絎 +結 +絕 +絛 +絜 +絞 +絡 +絢 +給 +絨 +絪 +絮 +統 +絲 +絳 +絵 +絶 +絹 +絺 +綁 +綃 +綈 +綉 +綎 +綏 +經 +綖 +継 +続 +綜 +綝 +綞 +綠 +綢 +綣 +綦 +綧 +綫 +綬 +維 +綮 +綰 +綱 +網 +綳 +綴 +綸 +綺 +綻 +綽 +綾 +綿 +緁 
+緃 +緄 +緈 +緊 +緋 +総 +緑 +緒 +緖 +緘 +線 +緜 +緝 +緞 +締 +緡 +緣 +緤 +編 +緩 +緬 +緯 +緱 +緲 +練 +緹 +緻 +縂 +縄 +縈 +縉 +縊 +縕 +縛 +縝 +縞 +縠 +縡 +縣 +縤 +縫 +縮 +縯 +縱 +縴 +縵 +縷 +縹 +縻 +總 +績 +繁 +繃 +繆 +繇 +繒 +織 +繕 +繖 +繙 +繚 +繞 +繡 +繩 +繪 +繫 +繭 +繰 +繳 +繹 +繻 +繼 +繽 +繾 +纁 +纂 +纈 +續 +纍 +纏 +纓 +纔 +纕 +纖 +纘 +纛 +纜 +缐 +缶 +缸 +缺 +缽 +罃 +罄 +罅 +罈 +罉 +罌 +罍 +罐 +罔 +罕 +罘 +罟 +罡 +罨 +罩 +罪 +置 +罰 +罱 +署 +罳 +罵 +罶 +罷 +罹 +罽 +羂 +羅 +羆 +羈 +羊 +羋 +羌 +美 +羔 +羕 +羗 +羙 +羚 +羞 +羡 +羣 +群 +羥 +羧 +羨 +義 +羯 +羰 +羱 +羲 +羸 +羹 +羽 +羿 +翀 +翁 +翂 +翃 +翅 +翊 +翌 +翎 +翏 +習 +翔 +翕 +翙 +翜 +翟 +翠 +翡 +翥 +翦 +翩 +翬 +翮 +翰 +翱 +翳 +翹 +翻 +翼 +耀 +老 +考 +耄 +者 +耆 +而 +耍 +耎 +耐 +耑 +耒 +耔 +耕 +耗 +耘 +耙 +耜 +耦 +耨 +耬 +耳 +耵 +耶 +耷 +耽 +耿 +聃 +聆 +聊 +聒 +聖 +聘 +聚 +聞 +聟 +聨 +聯 +聰 +聱 +聲 +聳 +聴 +聶 +職 +聽 +聾 +聿 +肄 +肅 +肆 +肇 +肉 +肋 +肌 +肏 +肖 +肘 +肚 +肛 +肜 +肝 +肟 +股 +肢 +肥 +肩 +肪 +肫 +肯 +肱 +育 +肸 +肹 +肺 +肼 +肽 +胂 +胃 +胄 +胅 +胇 +胊 +背 +胍 +胎 +胖 +胗 +胙 +胚 +胛 +胝 +胞 +胡 +胤 +胥 +胬 +胭 +胰 +胱 +胳 +胴 +胸 +胺 +胼 +能 +脂 +脅 +脆 +脇 +脈 +脊 +脒 +脖 +脘 +脛 +脣 +脩 +脫 +脬 +脭 +脯 +脲 +脳 +脷 +脹 +脾 +腆 +腈 +腊 +腋 +腌 +腎 +腐 +腑 +腓 +腔 +腕 +腥 +腦 +腧 +腩 +腫 +腮 +腰 +腱 +腳 +腴 +腸 +腹 +腺 +腿 +膀 +膂 +膈 +膊 +膏 +膚 +膛 +膜 +膝 +膠 +膣 +膥 +膦 +膨 +膩 +膮 +膳 +膺 +膽 +膾 +膿 +臀 +臂 +臃 +臆 +臉 +臊 +臍 +臏 +臘 +臚 +臞 +臟 +臠 +臣 +臧 +臨 +自 +臭 +臯 +至 +致 +臺 +臻 +臼 +臾 +舂 +舅 +與 +興 +舉 +舊 +舌 +舍 +舎 +舒 +舔 +舖 +舘 +舛 +舜 +舞 +舟 +舢 +舥 +舨 +舩 +航 +舫 +般 +舲 +舵 +舶 +舷 +舸 +船 +舺 +艅 +艇 +艉 +艋 +艎 +艏 +艔 +艘 +艙 +艚 +艦 +艮 +良 +艱 +色 +艶 +艷 +艸 +艽 +艾 +艿 +芃 +芊 +芋 +芍 +芎 +芑 +芒 +芘 +芙 +芛 +芝 +芡 +芥 +芨 +芩 +芪 +芫 +芬 +芭 +芮 +芯 +花 +芳 +芴 +芷 +芸 +芹 +芻 +芽 +芾 +苄 +苅 +苑 +苒 +苓 +苔 +苕 +苗 +苛 +苜 +苝 +苞 +苟 +苡 +苣 +苤 +若 +苦 +苧 +苪 +苫 +苯 +英 +苳 +苴 +苷 +苺 +苻 +苼 +苾 +茀 +茁 +茂 +范 +茄 +茅 +茆 +茇 +茈 +茉 +茌 +茗 +茘 +茚 +茛 +茜 +茝 +茨 +茫 +茬 +茭 +茮 +茯 +茱 +茲 +茴 +茵 +茶 +茷 +茸 +茹 +茺 +茼 +荀 +荃 +荅 +荇 +草 +荊 +荎 +荏 +荒 +荔 +荖 +荘 +荳 +荷 +荸 +荻 +荼 +荽 +莆 +莉 +莊 +莎 +莒 +莓 +莕 +莖 +莘 +莙 +莛 +莜 +莞 +莠 +莢 +莧 +莨 +莩 +莪 +莫 +莽 +莿 +菀 +菁 +菅 +菇 +菈 +菉 +菊 +菌 +菍 +菏 +菑 +菓 +菔 +菖 +菘 +菜 +菝 +菟 +菠 +菡 +菥 +菩 +菪 +菫 +華 +菰 +菱 +菲 +菴 +菶 +菸 +菹 +菺 +菼 +菽 +菾 +萁 +萃 +萄 +萇 +萊 +萌 +萍 +萎 +萐 +萘 +萜 +萠 +萡 +萣 +萩 +萬 +萭 +萱 +萵 +萸 +萹 +萼 +落 +葃 +葆 +葉 +葊 +葎 +葑 +葒 +著 +葙 +葚 +葛 +葜 +葝 +葡 +董 +葦 +葩 +葫 +葬 +葭 +葯 +葰 +葳 +葵 +葶 +葷 +葺 +蒂 +蒄 +蒍 +蒎 +蒐 +蒓 +蒔 +蒗 +蒙 +蒜 +蒞 +蒟 +蒡 +蒢 +蒤 +蒧 +蒨 +蒭 +蒯 +蒲 +蒴 +蒸 
+蒹 +蒺 +蒻 +蒼 +蒽 +蒾 +蒿 +蓀 +蓁 +蓂 +蓄 +蓆 +蓉 +蓋 +蓍 +蓑 +蓓 +蓖 +蓘 +蓚 +蓧 +蓨 +蓪 +蓬 +蓭 +蓮 +蓯 +蓳 +蓼 +蓽 +蓿 +蔆 +蔎 +蔑 +蔓 +蔔 +蔕 +蔗 +蔘 +蔚 +蔝 +蔞 +蔡 +蔣 +蔥 +蔦 +蔬 +蔭 +蔴 +蔵 +蔻 +蔽 +蕁 +蕃 +蕅 +蕈 +蕉 +蕊 +蕎 +蕑 +蕒 +蕖 +蕘 +蕙 +蕚 +蕟 +蕡 +蕢 +蕤 +蕨 +蕩 +蕪 +蕭 +蕷 +蕹 +蕺 +蕻 +蕾 +薀 +薄 +薆 +薇 +薈 +薊 +薌 +薏 +薐 +薑 +薔 +薗 +薘 +薙 +薛 +薜 +薞 +薟 +薡 +薦 +薨 +薩 +薪 +薫 +薬 +薯 +薰 +薲 +薷 +薸 +薹 +薺 +薾 +薿 +藁 +藉 +藍 +藎 +藏 +藐 +藔 +藕 +藜 +藝 +藟 +藤 +藥 +藦 +藨 +藩 +藪 +藶 +藸 +藹 +藺 +藻 +藿 +蘂 +蘄 +蘅 +蘆 +蘇 +蘊 +蘋 +蘐 +蘑 +蘓 +蘗 +蘘 +蘚 +蘞 +蘢 +蘧 +蘩 +蘭 +蘵 +蘶 +蘸 +蘼 +蘿 +虉 +虎 +虐 +虓 +虔 +處 +虖 +虛 +虜 +虞 +號 +虢 +虧 +虨 +虯 +虱 +虵 +虹 +虺 +虻 +蚆 +蚊 +蚋 +蚌 +蚍 +蚓 +蚖 +蚜 +蚝 +蚡 +蚢 +蚣 +蚤 +蚧 +蚨 +蚩 +蚪 +蚯 +蚱 +蚴 +蚵 +蚶 +蚺 +蚼 +蛀 +蛄 +蛇 +蛉 +蛋 +蛍 +蛐 +蛑 +蛔 +蛙 +蛛 +蛞 +蛟 +蛤 +蛭 +蛯 +蛸 +蛹 +蛺 +蛻 +蛾 +蜀 +蜂 +蜃 +蜆 +蜇 +蜈 +蜉 +蜊 +蜍 +蜑 +蜒 +蜓 +蜘 +蜚 +蜛 +蜜 +蜞 +蜢 +蜣 +蜥 +蜨 +蜮 +蜯 +蜱 +蜴 +蜷 +蜻 +蜾 +蜿 +蝀 +蝌 +蝍 +蝎 +蝓 +蝕 +蝗 +蝘 +蝙 +蝚 +蝟 +蝠 +蝣 +蝤 +蝦 +蝨 +蝮 +蝯 +蝰 +蝲 +蝴 +蝶 +蝸 +蝽 +螂 +螃 +螄 +螅 +螈 +螋 +融 +螐 +螔 +螞 +螟 +螠 +螢 +螣 +螥 +螫 +螭 +螯 +螳 +螶 +螺 +螻 +螽 +螾 +蟀 +蟄 +蟅 +蟆 +蟊 +蟋 +蟌 +蟎 +蟑 +蟒 +蟜 +蟠 +蟥 +蟪 +蟫 +蟬 +蟯 +蟲 +蟳 +蟴 +蟶 +蟹 +蟻 +蟾 +蠂 +蠃 +蠄 +蠅 +蠆 +蠊 +蠋 +蠍 +蠐 +蠑 +蠓 +蠔 +蠕 +蠖 +蠘 +蠙 +蠟 +蠡 +蠢 +蠣 +蠱 +蠲 +蠵 +蠶 +蠷 +蠹 +蠻 +血 +衂 +衆 +行 +衍 +衎 +術 +衕 +衖 +街 +衙 +衚 +衛 +衜 +衝 +衞 +衡 +衢 +衣 +表 +衩 +衫 +衰 +衲 +衷 +衽 +衾 +衿 +袁 +袂 +袈 +袋 +袍 +袓 +袖 +袛 +袞 +袤 +袪 +被 +袱 +袴 +袾 +裁 +裂 +裊 +裎 +裒 +裔 +裕 +裖 +裘 +裙 +補 +裝 +裟 +裡 +裨 +裬 +裱 +裳 +裴 +裵 +裸 +裹 +製 +裾 +裿 +褀 +褂 +複 +褌 +褍 +褎 +褐 +褒 +褓 +褔 +褘 +褙 +褚 +褞 +褥 +褧 +褪 +褫 +褭 +褲 +褶 +褸 +褻 +襄 +襌 +襖 +襞 +襟 +襠 +襤 +襦 +襪 +襯 +襲 +襴 +襶 +襻 +襾 +西 +要 +覃 +覆 +覇 +覈 +見 +覌 +規 +覓 +視 +覚 +覡 +覦 +覧 +親 +覬 +覲 +観 +覺 +覽 +覿 +觀 +角 +觔 +觙 +觚 +觜 +解 +觭 +觱 +觴 +觶 +觸 +觿 +言 +訁 +訂 +訃 +訇 +計 +訊 +訌 +討 +訏 +訐 +訒 +訓 +訔 +訕 +訖 +託 +記 +訛 +訝 +訟 +訣 +訥 +訪 +設 +許 +訴 +訶 +診 +註 +証 +訾 +詁 +詆 +詈 +詐 +詒 +詔 +評 +詛 +詞 +詠 +詡 +詢 +詣 +詥 +試 +詧 +詩 +詫 +詭 +詮 +詰 +話 +該 +詳 +詵 +詹 +詼 +誄 +誅 +誇 +誌 +認 +誒 +誓 +誕 +誘 +語 +誠 +誡 +誣 +誤 +誥 +誦 +誨 +說 +説 +読 +誰 +課 +誴 +誹 +誼 +誾 +調 +談 +請 +諍 +諏 +諒 +論 +諗 +諜 +諟 +諠 +諡 +諤 +諦 +諧 +諪 +諫 +諭 +諮 +諱 +諲 +諳 +諴 +諶 +諷 +諸 +諺 +諼 +諾 +謀 +謁 +謂 +謄 +謇 +謊 +謌 +謎 +謏 +謐 +謔 +謖 +謗 +謙 +謚 +講 +謜 +謝 +謠 +謢 +謤 +謨 +謩 +謫 +謬 +謳 +謹 +謾 +證 +譏 +譓 +譔 +識 +譙 +譚 +譜 +譞 +警 +譫 +譬 +譭 +譯 +議 +譲 +譳 +譴 +護 +譽 +譿 +讀 +讃 +變 +讌 +讎 +讓 +讖 +讙 +讚 +讜 +讞 
+谷 +谿 +豁 +豆 +豇 +豈 +豉 +豊 +豌 +豎 +豐 +豔 +豕 +豚 +象 +豢 +豨 +豪 +豫 +豬 +豳 +豸 +豹 +豺 +豿 +貂 +貅 +貉 +貊 +貌 +貐 +貒 +貓 +貔 +貘 +貝 +貞 +負 +財 +貢 +貤 +貧 +貨 +販 +貪 +貫 +責 +貭 +貮 +貯 +貲 +貳 +貴 +貶 +買 +貸 +貺 +費 +貼 +貽 +貿 +賀 +賁 +賂 +賃 +賄 +資 +賈 +賊 +賑 +賒 +賓 +賔 +賕 +賚 +賜 +賞 +賠 +賡 +賢 +賣 +賤 +賦 +賨 +質 +賬 +賭 +賴 +賹 +賺 +賻 +購 +賽 +賾 +贄 +贅 +贇 +贈 +贊 +贌 +贍 +贏 +贓 +贔 +贖 +贛 +赤 +赦 +赧 +赫 +赬 +赭 +走 +赳 +赴 +起 +趁 +超 +越 +趐 +趕 +趖 +趙 +趟 +趣 +趨 +足 +趴 +趵 +趺 +趼 +趾 +跅 +跆 +跋 +跌 +跏 +跑 +跖 +跗 +跛 +距 +跟 +跡 +跣 +跤 +跨 +跩 +跪 +路 +跳 +踎 +踏 +踐 +踝 +踞 +踢 +踩 +踰 +踴 +踹 +踺 +蹂 +蹄 +蹇 +蹈 +蹉 +蹊 +蹋 +蹕 +蹙 +蹟 +蹠 +蹤 +蹦 +蹬 +蹭 +蹯 +蹲 +蹴 +蹶 +蹺 +蹻 +蹼 +躁 +躂 +躄 +躉 +躋 +躍 +躑 +躒 +躔 +躝 +躪 +身 +躬 +躰 +躲 +躺 +軀 +車 +軋 +軌 +軍 +軎 +軒 +軔 +軛 +軟 +転 +軫 +軲 +軸 +軹 +軺 +軻 +軼 +軽 +軾 +較 +輄 +輅 +載 +輋 +輒 +輓 +輔 +輕 +輛 +輝 +輞 +輟 +輥 +輦 +輩 +輪 +輬 +輭 +輯 +輶 +輸 +輻 +輾 +輿 +轀 +轂 +轄 +轅 +轆 +轉 +轍 +轎 +轘 +轝 +轟 +轤 +辛 +辜 +辟 +辣 +辦 +辧 +辨 +辭 +辮 +辯 +辰 +辱 +農 +辵 +辺 +辻 +込 +迂 +迄 +迅 +迎 +近 +返 +迢 +迤 +迥 +迦 +迪 +迫 +迭 +迮 +述 +迴 +迵 +迷 +迸 +迺 +追 +退 +送 +逃 +逄 +逅 +逆 +逈 +逋 +逌 +逍 +逎 +透 +逐 +逑 +途 +逕 +逖 +逗 +這 +通 +逛 +逝 +逞 +速 +造 +逢 +連 +逤 +逨 +逮 +逯 +進 +逴 +逵 +逸 +逹 +逺 +逼 +逾 +遁 +遂 +遄 +遇 +遊 +運 +遍 +過 +遏 +遐 +遒 +道 +達 +違 +遘 +遙 +遛 +遜 +遞 +遠 +遢 +遣 +遨 +適 +遭 +遮 +遯 +遲 +遴 +遵 +遶 +遷 +選 +遹 +遺 +遼 +避 +邀 +邁 +邂 +邃 +還 +邇 +邈 +邉 +邊 +邋 +邏 +邑 +邕 +邗 +邙 +邛 +邠 +邡 +邢 +那 +邦 +邨 +邪 +邯 +邰 +邱 +邲 +邳 +邴 +邵 +邸 +邽 +邾 +郁 +郃 +郄 +郅 +郇 +郊 +郋 +郎 +郗 +郛 +郜 +郝 +郞 +郟 +郡 +郢 +郤 +部 +郪 +郫 +郭 +郯 +郳 +郴 +郵 +郷 +都 +郾 +郿 +鄂 +鄃 +鄄 +鄆 +鄉 +鄋 +鄑 +鄒 +鄔 +鄖 +鄗 +鄘 +鄙 +鄚 +鄜 +鄞 +鄠 +鄢 +鄣 +鄤 +鄧 +鄩 +鄫 +鄭 +鄯 +鄰 +鄱 +鄲 +鄳 +鄴 +鄺 +酃 +酆 +酈 +酉 +酊 +酋 +酌 +配 +酎 +酏 +酐 +酒 +酔 +酗 +酚 +酞 +酡 +酢 +酣 +酥 +酩 +酪 +酬 +酮 +酯 +酰 +酴 +酵 +酶 +酷 +酸 +酺 +酼 +醁 +醂 +醃 +醅 +醇 +醉 +醋 +醌 +醍 +醐 +醒 +醚 +醛 +醜 +醞 +醢 +醣 +醪 +醫 +醬 +醮 +醯 +醴 +醺 +醾 +醿 +釀 +釁 +釆 +采 +釉 +釋 +里 +重 +野 +量 +釐 +金 +釒 +釓 +釔 +釕 +釗 +釘 +釙 +釚 +釜 +針 +釣 +釤 +釦 +釧 +釩 +釪 +釭 +釴 +釵 +釷 +釹 +釺 +鈀 +鈁 +鈄 +鈇 +鈈 +鈉 +鈊 +鈍 +鈏 +鈐 +鈑 +鈔 +鈕 +鈖 +鈞 +鈢 +鈣 +鈥 +鈦 +鈫 +鈮 +鈰 +鈳 +鈴 +鈷 +鈸 +鈹 +鈺 +鈾 +鈿 +鉀 +鉄 +鉅 +鉆 +鉈 +鉉 +鉋 +鉌 +鉍 +鉏 +鉑 +鉓 +鉗 +鉚 +鉛 +鉞 +鉟 +鉤 +鉦 +鉬 +鉭 +鉲 +鉶 +鉷 +鉸 +鉻 +鉾 +鉿 +銀 +銂 +銃 +銅 +銋 +銍 +銑 +銓 +銕 +銖 +銘 +銚 +銜 +銠 +銣 +銥 +銦 +銨 +銩 +銪 +銫 +銬 +銭 +銱 +銲 +銳 +銶 +銷 +銹 +銻 +銼 +銾 +鋁 +鋅 +鋆 +鋇 +鋌 +鋏 +鋐 +鋒 +鋕 +鋗 +鋙 +鋡 +鋤 +鋥 +鋦 +鋨 +鋪 +鋮 +鋯 
+鋰 +鋱 +鋳 +鋶 +鋸 +鋹 +鋼 +錀 +錄 +錏 +錐 +錒 +錕 +錘 +錚 +錞 +錟 +錠 +錡 +錢 +錦 +錨 +錫 +錬 +錮 +錯 +錳 +錶 +錸 +錻 +鍀 +鍇 +鍈 +鍉 +鍊 +鍋 +鍍 +鍏 +鍔 +鍘 +鍛 +鍝 +鍟 +鍠 +鍥 +鍩 +鍬 +鍱 +鍳 +鍵 +鍶 +鍷 +鍺 +鍼 +鍾 +鎂 +鎅 +鎊 +鎌 +鎏 +鎓 +鎔 +鎖 +鎗 +鎘 +鎚 +鎛 +鎢 +鎣 +鎦 +鎧 +鎪 +鎬 +鎭 +鎮 +鎰 +鎳 +鎵 +鎻 +鏃 +鏇 +鏈 +鏊 +鏌 +鏐 +鏑 +鏓 +鏖 +鏗 +鏘 +鏜 +鏝 +鏞 +鏟 +鏡 +鏢 +鏤 +鏦 +鏳 +鏴 +鏵 +鏷 +鏻 +鏽 +鐃 +鐇 +鐈 +鐓 +鐔 +鐘 +鐙 +鐠 +鐡 +鐤 +鐦 +鐧 +鐫 +鐬 +鐭 +鐮 +鐲 +鐳 +鐵 +鐸 +鐺 +鐽 +鐿 +鑀 +鑁 +鑂 +鑄 +鑅 +鑊 +鑌 +鑑 +鑒 +鑛 +鑠 +鑣 +鑨 +鑪 +鑫 +鑭 +鑰 +鑲 +鑴 +鑷 +鑼 +鑽 +鑾 +鑿 +長 +門 +閂 +閃 +閆 +閉 +開 +閎 +閏 +閑 +閒 +間 +閔 +閘 +閜 +閞 +閟 +関 +閣 +閥 +閦 +閨 +閩 +閬 +閭 +閰 +閱 +閶 +閹 +閻 +閼 +閾 +閿 +闆 +闇 +闈 +闊 +闋 +闌 +闍 +闐 +闓 +闔 +闕 +闖 +闘 +關 +闞 +闡 +闢 +闥 +阜 +阝 +阡 +阪 +阭 +阮 +阯 +阱 +防 +阻 +阿 +陀 +陁 +陂 +附 +陋 +陌 +降 +限 +陔 +陘 +陛 +陜 +陝 +陞 +陟 +陡 +院 +陣 +除 +陪 +陬 +陰 +陲 +陳 +陵 +陶 +陷 +陸 +険 +陽 +隄 +隅 +隆 +隈 +隊 +隋 +隍 +階 +隔 +隕 +隗 +隘 +隙 +際 +障 +隣 +隧 +隨 +險 +隰 +隱 +隲 +隳 +隴 +隷 +隸 +隹 +隻 +隼 +雀 +雁 +雄 +雅 +集 +雇 +雉 +雋 +雌 +雍 +雎 +雑 +雒 +雕 +雖 +雙 +雛 +雜 +雝 +雞 +離 +難 +雨 +雩 +雪 +雫 +雯 +雱 +雲 +零 +雷 +雹 +電 +需 +霄 +霅 +霆 +震 +霈 +霉 +霊 +霍 +霎 +霏 +霑 +霓 +霖 +霙 +霜 +霞 +霤 +霧 +霨 +霰 +露 +霶 +霸 +霹 +霽 +霾 +靁 +靂 +靄 +靈 +靉 +靑 +青 +靖 +靚 +靛 +靜 +非 +靠 +靡 +面 +革 +靫 +靬 +靭 +靳 +靴 +靶 +靺 +靼 +鞅 +鞆 +鞋 +鞍 +鞏 +鞘 +鞞 +鞠 +鞣 +鞥 +鞦 +鞨 +鞭 +鞮 +鞴 +韁 +韃 +韆 +韋 +韌 +韑 +韓 +韙 +韜 +韞 +韠 +韡 +韭 +韮 +音 +韶 +韺 +韻 +韾 +響 +頁 +頂 +頃 +項 +順 +須 +頊 +頌 +頍 +頎 +頏 +預 +頑 +頒 +頓 +頔 +頗 +領 +頜 +頠 +頡 +頤 +頦 +頫 +頭 +頰 +頴 +頵 +頷 +頸 +頹 +頻 +頼 +顆 +題 +額 +顎 +顏 +顒 +顓 +顔 +顕 +顗 +願 +顙 +顛 +類 +顥 +顧 +顫 +顯 +顰 +顱 +顳 +顴 +風 +颮 +颯 +颱 +颶 +颺 +颼 +飄 +飆 +飈 +飛 +食 +飠 +飡 +飢 +飥 +飩 +飪 +飫 +飬 +飭 +飮 +飯 +飲 +飴 +飼 +飽 +飾 +餃 +餄 +餅 +餉 +養 +餌 +餎 +餐 +餒 +餓 +餗 +餘 +餚 +餛 +餞 +餠 +餡 +館 +餮 +餵 +餺 +餾 +餿 +饃 +饅 +饋 +饌 +饑 +饒 +饕 +饗 +饞 +饟 +饢 +首 +馗 +馘 +香 +馛 +馥 +馦 +馨 +馬 +馭 +馮 +馯 +馱 +馳 +馴 +馼 +駁 +駄 +駅 +駆 +駐 +駑 +駒 +駔 +駕 +駘 +駙 +駛 +駝 +駟 +駢 +駭 +駰 +駱 +駿 +騁 +騂 +騄 +騅 +騋 +騎 +騏 +験 +騖 +騙 +騤 +騨 +騫 +騭 +騮 +騰 +騶 +騷 +騾 +驁 +驃 +驄 +驅 +驊 +驌 +驍 +驎 +驒 +驕 +驗 +驚 +驛 +驟 +驢 +驤 +驥 +驩 +驪 +骨 +骯 +骰 +骶 +骷 +骸 +骼 +髀 +髂 +髎 +髏 +髑 +髒 +髓 +體 +高 +髙 +髡 +髦 +髪 +髭 +髮 +髯 +髲 +髷 +髹 +髻 +鬃 +鬄 +鬅 +鬆 +鬍 +鬚 +鬟 +鬢 +鬣 +鬥 +鬧 +鬨 +鬩 +鬪 +鬬 +鬮 +鬯 +鬱 +鬲 +鬹 +鬻 +鬼 +魁 +魂 +魃 +魄 +魅 +魈 +魋 +魍 +魎 +魏 +魔 +魕 +魘 +魚 +魛 +魞 +魟 +魣 +魨 +魩 +魮 +魯 +魴 +魷 +鮀 +鮁 +鮃 +鮄 +鮊 +鮋 +鮍 +鮐 +鮑 +鮒 +鮓 +鮗 
+鮜 +鮟 +鮠 +鮡 +鮣 +鮨 +鮪 +鮫 +鮭 +鮮 +鮰 +鮸 +鮹 +鮻 +鯀 +鯁 +鯃 +鯇 +鯉 +鯊 +鯏 +鯒 +鯓 +鯔 +鯕 +鯖 +鯗 +鯙 +鯛 +鯡 +鯢 +鯤 +鯧 +鯨 +鯪 +鯭 +鯮 +鯰 +鯶 +鯷 +鯻 +鯽 +鯿 +鰂 +鰃 +鰆 +鰈 +鰉 +鰍 +鰏 +鰒 +鰓 +鰕 +鰗 +鰛 +鰜 +鰟 +鰣 +鰤 +鰧 +鰨 +鰩 +鰭 +鰮 +鰱 +鰲 +鰳 +鰶 +鰷 +鰹 +鰺 +鰻 +鰼 +鰾 +鱀 +鱂 +鱅 +鱇 +鱈 +鱉 +鱊 +鱒 +鱓 +鱔 +鱖 +鱗 +鱘 +鱚 +鱝 +鱟 +鱠 +鱣 +鱥 +鱧 +鱨 +鱬 +鱮 +鱰 +鱲 +鱵 +鱷 +鱸 +鱺 +鱻 +鳥 +鳧 +鳩 +鳯 +鳰 +鳳 +鳴 +鳶 +鳽 +鴆 +鴇 +鴉 +鴒 +鴓 +鴕 +鴗 +鴛 +鴝 +鴞 +鴟 +鴡 +鴣 +鴦 +鴨 +鴫 +鴯 +鴰 +鴴 +鴻 +鴿 +鵂 +鵄 +鵎 +鵐 +鵑 +鵒 +鵓 +鵙 +鵜 +鵝 +鵞 +鵟 +鵠 +鵡 +鵪 +鵬 +鵯 +鵰 +鵲 +鵵 +鵼 +鵾 +鶆 +鶇 +鶉 +鶏 +鶒 +鶓 +鶘 +鶚 +鶡 +鶥 +鶩 +鶬 +鶯 +鶲 +鶴 +鶹 +鶺 +鶻 +鶼 +鶿 +鷂 +鷄 +鷉 +鷎 +鷓 +鷗 +鷙 +鷚 +鷟 +鷥 +鷦 +鷫 +鷯 +鷲 +鷳 +鷸 +鷹 +鷺 +鸊 +鸌 +鸐 +鸑 +鸕 +鸘 +鸚 +鸛 +鸜 +鸝 +鸞 +鹮 +鹵 +鹹 +鹼 +鹽 +鹿 +麂 +麅 +麇 +麈 +麊 +麋 +麐 +麒 +麓 +麗 +麝 +麞 +麟 +麥 +麩 +麪 +麯 +麴 +麵 +麹 +麺 +麻 +麼 +麽 +麾 +麿 +黁 +黃 +黇 +黌 +黍 +黎 +黏 +黐 +黑 +黒 +黔 +默 +黙 +黛 +黜 +黝 +點 +黟 +黥 +黧 +黨 +黯 +黴 +黶 +黻 +黼 +黽 +黿 +鼂 +鼇 +鼈 +鼉 +鼎 +鼐 +鼒 +鼓 +鼕 +鼙 +鼠 +鼢 +鼩 +鼬 +鼯 +鼱 +鼴 +鼷 +鼻 +鼽 +鼾 +齊 +齋 +齒 +齕 +齡 +齣 +齦 +齧 +齲 +齶 +龍 +龎 +龐 +龑 +龔 +龕 +龜 +龝 +龠 +龢 +郎 +凉 +﹑ +﹗ +﹝ +﹞ +﹢ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +K +L +M +N +O +P +R +S +T +U +V +W +Y +Z +[ +] +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +r +s +t +u +z +{ +| +} +~ +¥ +𣇉 + diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b6f66494d5417e18bbd225719aa72690e09e126 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt @@ -0,0 +1,163 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +Ё +Є +І +Ј +Љ +Ў +А +Б +В +Г +Д +Е +Ж +З +И +Й +К +Л +М +Н +О +П +Р +С +Т +У +Ф +Х +Ц +Ч +Ш +Щ +Ъ +Ы +Ь +Э +Ю +Я +а +б +в +г +д +е +ж +з +и +й +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +щ +ъ +ы +ь +э +ю +я +ё +ђ +є +і +ј +љ +њ +ћ +ў +џ +Ґ +ґ diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..f55923061bfd480b875bb3679d7a75a9157387a9 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt @@ -0,0 +1,167 @@ + +! +# +$ +% +& +' +( ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +É +é +ँ +ं +ः +अ +आ +इ +ई +उ +ऊ +ऋ +ए +ऐ +ऑ +ओ +औ +क +ख +ग +घ +ङ +च +छ +ज +झ +ञ +ट +ठ +ड +ढ +ण +त +थ +द +ध +न +ऩ +प +फ +ब +भ +म +य +र +ऱ +ल +ळ +व +श +ष +स +ह +़ +ा +ि +ी +ु +ू +ृ +ॅ +े +ै +ॉ +ो +ौ +् +॒ +क़ +ख़ +ग़ +ज़ +ड़ +ढ़ +फ़ +ॠ +। +० +१ +२ +३ +४ +५ +६ +७ +८ +९ +॰ diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..7677d31b9d3f08eef2823c2cf051beeab1f0470b --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt @@ -0,0 +1,95 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ + diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..339d4b89e5159a346636641a0814874faa59754a --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt @@ -0,0 +1,4399 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +© +° +² +´ +½ +Á +Ä +Å +Ç +È +É +Í +Ó +Ö +× +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +í +ð +ñ +ò +ó +ô +õ +ö +ø +ú +û +ü +ý +ā +ă +ą +ć +Č +č +đ +ē +ė +ę +ğ +ī +ı +Ł +ł +ń +ň +ō +ř +Ş +ş +Š +š +ţ +ū +ż +Ž +ž +Ș +ș +ț +Δ +α +λ +μ +φ +Г +О +а +в +л +о +р +с +т +я +ồ +​ +— +― +’ +“ +” +… +℃ +→ +∇ +− +■ +☆ +  +、 +。 +々 +〆 +〈 +〉 +「 +」 +『 +』 +〔 +〕 +〜 +ぁ +あ +ぃ +い +う +ぇ +え +ぉ +お +か +が +き +ぎ +く +ぐ +け +げ +こ +ご +さ +ざ +し +じ +す +ず +せ +ぜ +そ +ぞ +た +だ +ち +ぢ +っ +つ +づ +て +で +と +ど +な +に +ぬ +ね +の +は +ば +ぱ +ひ +び +ぴ +ふ +ぶ +ぷ +へ +べ +ぺ +ほ +ぼ +ぽ +ま +み +む +め +も +ゃ +や +ゅ +ゆ +ょ +よ +ら +り +る +れ +ろ +わ +ゑ +を +ん +ゝ +ゞ +ァ +ア +ィ +イ +ゥ +ウ +ェ +エ +ォ +オ +カ +ガ +キ +ギ +ク +グ +ケ +ゲ +コ +ゴ +サ +ザ +シ +ジ +ス +ズ +セ +ゼ +ソ +ゾ +タ +ダ +チ +ヂ +ッ +ツ +ヅ +テ +デ +ト +ド +ナ +ニ +ヌ +ネ +ノ +ハ +バ +パ +ヒ +ビ +ピ +フ +ブ +プ +ヘ +ベ +ペ +ホ +ボ +ポ +マ +ミ +ム +メ +モ +ャ +ヤ +ュ +ユ +ョ +ヨ +ラ +リ +ル +レ +ロ +ワ +ヰ +ン +ヴ +ヵ +ヶ +・ +ー +㈱ +一 +丁 +七 +万 +丈 +三 +上 +下 +不 +与 +丑 +且 +世 +丘 +丙 +丞 +両 +並 +中 +串 +丸 +丹 +主 +丼 +丿 +乃 +久 +之 +乎 +乏 +乗 +乘 +乙 +九 +乞 +也 +乱 +乳 +乾 +亀 +了 +予 +争 +事 +二 +于 +互 +五 +井 +亘 +亙 +些 +亜 +亟 +亡 +交 +亥 +亦 +亨 +享 +京 +亭 +亮 +人 +什 +仁 
+仇 +今 +介 +仍 +仏 +仔 +仕 +他 +仗 +付 +仙 +代 +令 +以 +仮 +仰 +仲 +件 +任 +企 +伊 +伍 +伎 +伏 +伐 +休 +会 +伝 +伯 +估 +伴 +伶 +伸 +伺 +似 +伽 +佃 +但 +位 +低 +住 +佐 +佑 +体 +何 +余 +佚 +佛 +作 +佩 +佳 +併 +佶 +使 +侈 +例 +侍 +侏 +侑 +侘 +供 +依 +侠 +価 +侮 +侯 +侵 +侶 +便 +係 +促 +俄 +俊 +俔 +俗 +俘 +保 +信 +俣 +俤 +修 +俯 +俳 +俵 +俸 +俺 +倉 +個 +倍 +倒 +候 +借 +倣 +値 +倫 +倭 +倶 +倹 +偃 +假 +偈 +偉 +偏 +偐 +偕 +停 +健 +側 +偵 +偶 +偽 +傀 +傅 +傍 +傑 +傘 +備 +催 +傭 +傲 +傳 +債 +傷 +傾 +僊 +働 +像 +僑 +僕 +僚 +僧 +僭 +僮 +儀 +億 +儇 +儒 +儛 +償 +儡 +優 +儲 +儺 +儼 +兀 +允 +元 +兄 +充 +兆 +先 +光 +克 +兌 +免 +兎 +児 +党 +兜 +入 +全 +八 +公 +六 +共 +兵 +其 +具 +典 +兼 +内 +円 +冊 +再 +冑 +冒 +冗 +写 +冠 +冤 +冥 +冨 +冬 +冲 +决 +冶 +冷 +准 +凉 +凋 +凌 +凍 +凛 +凝 +凞 +几 +凡 +処 +凪 +凰 +凱 +凶 +凸 +凹 +出 +函 +刀 +刃 +分 +切 +刈 +刊 +刎 +刑 +列 +初 +判 +別 +利 +刪 +到 +制 +刷 +券 +刹 +刺 +刻 +剃 +則 +削 +剋 +前 +剖 +剛 +剣 +剤 +剥 +剪 +副 +剰 +割 +創 +剽 +劇 +劉 +劔 +力 +功 +加 +劣 +助 +努 +劫 +劭 +励 +労 +効 +劾 +勃 +勅 +勇 +勉 +勒 +動 +勘 +務 +勝 +募 +勢 +勤 +勧 +勲 +勺 +勾 +勿 +匁 +匂 +包 +匏 +化 +北 +匙 +匝 +匠 +匡 +匣 +匯 +匲 +匹 +区 +医 +匿 +十 +千 +升 +午 +卉 +半 +卍 +卑 +卒 +卓 +協 +南 +単 +博 +卜 +占 +卦 +卯 +印 +危 +即 +却 +卵 +卸 +卿 +厄 +厚 +原 +厠 +厨 +厩 +厭 +厳 +去 +参 +又 +叉 +及 +友 +双 +反 +収 +叔 +取 +受 +叙 +叛 +叟 +叡 +叢 +口 +古 +句 +叩 +只 +叫 +召 +可 +台 +叱 +史 +右 +叶 +号 +司 +吃 +各 +合 +吉 +吊 +同 +名 +后 +吏 +吐 +向 +君 +吝 +吟 +吠 +否 +含 +吸 +吹 +吻 +吽 +吾 +呂 +呆 +呈 +呉 +告 +呑 +周 +呪 +呰 +味 +呼 +命 +咀 +咄 +咋 +和 +咒 +咫 +咲 +咳 +咸 +哀 +品 +哇 +哉 +員 +哨 +哩 +哭 +哲 +哺 +唄 +唆 +唇 +唐 +唖 +唯 +唱 +唳 +唸 +唾 +啄 +商 +問 +啓 +啼 +善 +喋 +喚 +喜 +喝 +喧 +喩 +喪 +喫 +喬 +單 +喰 +営 +嗅 +嗇 +嗔 +嗚 +嗜 +嗣 +嘆 +嘉 +嘗 +嘘 +嘩 +嘯 +嘱 +嘲 +嘴 +噂 +噌 +噛 +器 +噴 +噺 +嚆 +嚢 +囀 +囃 +囉 +囚 +四 +回 +因 +団 +困 +囲 +図 +固 +国 +圀 +圃 +國 +圏 +園 +圓 +團 +圜 +土 +圧 +在 +圭 +地 +址 +坂 +均 +坊 +坐 +坑 +坡 +坤 +坦 +坪 +垂 +型 +垢 +垣 +埃 +埋 +城 +埒 +埔 +域 +埠 +埴 +埵 +執 +培 +基 +埼 +堀 +堂 +堅 +堆 +堕 +堤 +堪 +堯 +堰 +報 +場 +堵 +堺 +塀 +塁 +塊 +塑 +塔 +塗 +塘 +塙 +塚 +塞 +塩 +填 +塵 +塾 +境 +墉 +墓 +増 +墜 +墟 +墨 +墳 +墺 +墻 +墾 +壁 +壇 +壊 +壌 +壕 +士 +壬 +壮 +声 +壱 +売 +壷 +壹 +壺 +壽 +変 +夏 +夕 +外 +夙 +多 +夜 +夢 +夥 +大 +天 +太 +夫 +夬 +夭 +央 +失 +夷 +夾 +奄 +奇 +奈 +奉 +奎 +奏 +契 +奔 +奕 +套 +奘 +奠 +奢 +奥 +奨 +奪 +奮 +女 +奴 +奸 +好 +如 +妃 +妄 +妊 +妍 +妓 +妖 +妙 +妥 +妨 +妬 +妲 +妹 +妻 +妾 +姉 +始 +姐 +姓 +委 +姚 +姜 +姞 +姥 +姦 +姨 +姪 +姫 +姶 +姻 +姿 +威 +娑 +娘 +娟 +娠 +娩 +娯 +娼 +婆 +婉 +婚 +婢 +婦 +婬 +婿 +媄 +媒 +媓 +媚 +媛 +媞 +媽 +嫁 +嫄 +嫉 +嫌 +嫐 +嫗 
+嫡 +嬉 +嬌 +嬢 +嬪 +嬬 +嬾 +孁 +子 +孔 +字 +存 +孚 +孝 +孟 +季 +孤 +学 +孫 +孵 +學 +宅 +宇 +守 +安 +宋 +完 +宍 +宏 +宕 +宗 +官 +宙 +定 +宛 +宜 +宝 +実 +客 +宣 +室 +宥 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宿 +寂 +寄 +寅 +密 +寇 +富 +寒 +寓 +寔 +寛 +寝 +察 +寡 +實 +寧 +審 +寮 +寵 +寶 +寸 +寺 +対 +寿 +封 +専 +射 +将 +尉 +尊 +尋 +對 +導 +小 +少 +尖 +尚 +尤 +尪 +尭 +就 +尹 +尺 +尻 +尼 +尽 +尾 +尿 +局 +居 +屈 +届 +屋 +屍 +屎 +屏 +屑 +屓 +展 +属 +屠 +層 +履 +屯 +山 +岐 +岑 +岡 +岩 +岫 +岬 +岳 +岷 +岸 +峠 +峡 +峨 +峯 +峰 +島 +峻 +崇 +崋 +崎 +崑 +崖 +崗 +崛 +崩 +嵌 +嵐 +嵩 +嵯 +嶂 +嶋 +嶠 +嶺 +嶼 +嶽 +巀 +巌 +巒 +巖 +川 +州 +巡 +巣 +工 +左 +巧 +巨 +巫 +差 +己 +巳 +巴 +巷 +巻 +巽 +巾 +市 +布 +帆 +希 +帖 +帚 +帛 +帝 +帥 +師 +席 +帯 +帰 +帳 +帷 +常 +帽 +幄 +幅 +幇 +幌 +幔 +幕 +幟 +幡 +幢 +幣 +干 +平 +年 +并 +幸 +幹 +幻 +幼 +幽 +幾 +庁 +広 +庄 +庇 +床 +序 +底 +庖 +店 +庚 +府 +度 +座 +庫 +庭 +庵 +庶 +康 +庸 +廂 +廃 +廉 +廊 +廓 +廟 +廠 +廣 +廬 +延 +廷 +建 +廻 +廼 +廿 +弁 +弄 +弉 +弊 +弌 +式 +弐 +弓 +弔 +引 +弖 +弗 +弘 +弛 +弟 +弥 +弦 +弧 +弱 +張 +強 +弼 +弾 +彈 +彊 +彌 +彎 +当 +彗 +彙 +彝 +形 +彦 +彩 +彫 +彬 +彭 +彰 +影 +彷 +役 +彼 +往 +征 +徂 +径 +待 +律 +後 +徐 +徑 +徒 +従 +得 +徠 +御 +徧 +徨 +復 +循 +徭 +微 +徳 +徴 +德 +徹 +徽 +心 +必 +忉 +忌 +忍 +志 +忘 +忙 +応 +忠 +快 +忯 +念 +忻 +忽 +忿 +怒 +怖 +思 +怠 +怡 +急 +性 +怨 +怪 +怯 +恂 +恋 +恐 +恒 +恕 +恣 +恤 +恥 +恨 +恩 +恬 +恭 +息 +恵 +悉 +悌 +悍 +悔 +悟 +悠 +患 +悦 +悩 +悪 +悲 +悼 +情 +惇 +惑 +惚 +惜 +惟 +惠 +惣 +惧 +惨 +惰 +想 +惹 +惺 +愈 +愉 +愍 +意 +愔 +愚 +愛 +感 +愷 +愿 +慈 +態 +慌 +慎 +慕 +慢 +慣 +慧 +慨 +慮 +慰 +慶 +憂 +憎 +憐 +憑 +憙 +憤 +憧 +憩 +憬 +憲 +憶 +憾 +懇 +應 +懌 +懐 +懲 +懸 +懺 +懽 +懿 +戈 +戊 +戌 +戎 +成 +我 +戒 +戔 +或 +戚 +戟 +戦 +截 +戮 +戯 +戴 +戸 +戻 +房 +所 +扁 +扇 +扈 +扉 +手 +才 +打 +払 +托 +扮 +扱 +扶 +批 +承 +技 +抄 +把 +抑 +抓 +投 +抗 +折 +抜 +択 +披 +抱 +抵 +抹 +押 +抽 +担 +拇 +拈 +拉 +拍 +拏 +拐 +拒 +拓 +拘 +拙 +招 +拝 +拠 +拡 +括 +拭 +拳 +拵 +拶 +拾 +拿 +持 +挂 +指 +按 +挑 +挙 +挟 +挨 +振 +挺 +挽 +挿 +捉 +捕 +捗 +捜 +捧 +捨 +据 +捺 +捻 +掃 +掄 +授 +掌 +排 +掖 +掘 +掛 +掟 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掲 +掴 +掻 +掾 +揃 +揄 +揆 +揉 +描 +提 +揖 +揚 +換 +握 +揮 +援 +揶 +揺 +損 +搦 +搬 +搭 +携 +搾 +摂 +摘 +摩 +摸 +摺 +撃 +撒 +撞 +撤 +撥 +撫 +播 +撮 +撰 +撲 +撹 +擁 +操 +擔 +擦 +擬 +擾 +攘 +攝 +攣 +支 +收 +改 +攻 +放 +政 +故 +敏 +救 +敗 +教 +敢 +散 +敦 +敬 +数 +整 +敵 +敷 +斂 +文 +斉 +斎 +斐 +斑 +斗 +料 +斜 +斟 +斤 +斥 +斧 +斬 +断 +斯 +新 +方 +於 +施 +旁 +旅 +旋 +旌 +族 +旗 +旛 +无 +旡 +既 +日 +旦 +旧 +旨 +早 +旬 +旭 +旺 +旻 +昂 +昆 +昇 +昉 +昌 +明 +昏 +易 +昔 +星 +映 +春 +昧 +昨 +昪 +昭 +是 +昵 +昼 +晁 +時 +晃 +晋 +晏 +晒 +晟 +晦 +晧 +晩 +普 +景 
+晴 +晶 +智 +暁 +暇 +暈 +暉 +暑 +暖 +暗 +暘 +暢 +暦 +暫 +暮 +暲 +暴 +暹 +暾 +曄 +曇 +曉 +曖 +曙 +曜 +曝 +曠 +曰 +曲 +曳 +更 +書 +曹 +曼 +曽 +曾 +替 +最 +會 +月 +有 +朋 +服 +朏 +朔 +朕 +朗 +望 +朝 +期 +朧 +木 +未 +末 +本 +札 +朱 +朴 +机 +朽 +杁 +杉 +李 +杏 +材 +村 +杓 +杖 +杜 +杞 +束 +条 +杢 +杣 +来 +杭 +杮 +杯 +東 +杲 +杵 +杷 +杼 +松 +板 +枅 +枇 +析 +枓 +枕 +林 +枚 +果 +枝 +枠 +枡 +枢 +枯 +枳 +架 +柄 +柊 +柏 +某 +柑 +染 +柔 +柘 +柚 +柯 +柱 +柳 +柴 +柵 +査 +柾 +柿 +栂 +栃 +栄 +栖 +栗 +校 +株 +栲 +栴 +核 +根 +栻 +格 +栽 +桁 +桂 +桃 +框 +案 +桐 +桑 +桓 +桔 +桜 +桝 +桟 +桧 +桴 +桶 +桾 +梁 +梅 +梆 +梓 +梔 +梗 +梛 +條 +梟 +梢 +梧 +梨 +械 +梱 +梲 +梵 +梶 +棄 +棋 +棒 +棗 +棘 +棚 +棟 +棠 +森 +棲 +棹 +棺 +椀 +椅 +椋 +植 +椎 +椏 +椒 +椙 +検 +椥 +椹 +椿 +楊 +楓 +楕 +楚 +楞 +楠 +楡 +楢 +楨 +楪 +楫 +業 +楮 +楯 +楳 +極 +楷 +楼 +楽 +概 +榊 +榎 +榕 +榛 +榜 +榮 +榱 +榴 +槃 +槇 +槊 +構 +槌 +槍 +槐 +様 +槙 +槻 +槽 +槿 +樂 +樋 +樓 +樗 +標 +樟 +模 +権 +横 +樫 +樵 +樹 +樺 +樽 +橇 +橋 +橘 +機 +橿 +檀 +檄 +檎 +檐 +檗 +檜 +檣 +檥 +檬 +檮 +檸 +檻 +櫃 +櫓 +櫛 +櫟 +櫨 +櫻 +欄 +欅 +欠 +次 +欣 +欧 +欲 +欺 +欽 +款 +歌 +歎 +歓 +止 +正 +此 +武 +歩 +歪 +歯 +歳 +歴 +死 +殆 +殉 +殊 +残 +殖 +殯 +殴 +段 +殷 +殺 +殻 +殿 +毀 +毅 +母 +毎 +毒 +比 +毘 +毛 +毫 +毬 +氈 +氏 +民 +気 +水 +氷 +永 +氾 +汀 +汁 +求 +汎 +汐 +汗 +汚 +汝 +江 +池 +汪 +汰 +汲 +決 +汽 +沂 +沃 +沅 +沆 +沈 +沌 +沐 +沓 +沖 +沙 +没 +沢 +沱 +河 +沸 +油 +治 +沼 +沽 +沿 +況 +泉 +泊 +泌 +法 +泗 +泡 +波 +泣 +泥 +注 +泯 +泰 +泳 +洋 +洒 +洗 +洛 +洞 +津 +洩 +洪 +洲 +洸 +洹 +活 +洽 +派 +流 +浄 +浅 +浙 +浚 +浜 +浣 +浦 +浩 +浪 +浮 +浴 +海 +浸 +涅 +消 +涌 +涙 +涛 +涯 +液 +涵 +涼 +淀 +淄 +淆 +淇 +淋 +淑 +淘 +淡 +淤 +淨 +淫 +深 +淳 +淵 +混 +淹 +添 +清 +済 +渉 +渋 +渓 +渕 +渚 +減 +渟 +渠 +渡 +渤 +渥 +渦 +温 +渫 +測 +港 +游 +渾 +湊 +湖 +湘 +湛 +湧 +湫 +湯 +湾 +湿 +満 +源 +準 +溜 +溝 +溢 +溥 +溪 +溶 +溺 +滄 +滅 +滋 +滌 +滑 +滕 +滝 +滞 +滴 +滸 +滹 +滿 +漁 +漂 +漆 +漉 +漏 +漑 +演 +漕 +漠 +漢 +漣 +漫 +漬 +漱 +漸 +漿 +潅 +潔 +潙 +潜 +潟 +潤 +潭 +潮 +潰 +潴 +澁 +澂 +澄 +澎 +澗 +澤 +澪 +澱 +澳 +激 +濁 +濃 +濟 +濠 +濡 +濤 +濫 +濯 +濱 +濾 +瀉 +瀋 +瀑 +瀕 +瀞 +瀟 +瀧 +瀬 +瀾 +灌 +灑 +灘 +火 +灯 +灰 +灸 +災 +炉 +炊 +炎 +炒 +炭 +炮 +炷 +点 +為 +烈 +烏 +烙 +烝 +烹 +焔 +焙 +焚 +無 +焦 +然 +焼 +煇 +煉 +煌 +煎 +煕 +煙 +煤 +煥 +照 +煩 +煬 +煮 +煽 +熈 +熊 +熙 +熟 +熨 +熱 +熹 +熾 +燃 +燈 +燎 +燔 +燕 +燗 +燥 +燭 +燻 +爆 +爐 +爪 +爬 +爲 +爵 +父 +爺 +爼 +爽 +爾 +片 +版 +牌 +牒 +牘 +牙 +牛 +牝 +牟 +牡 +牢 +牧 +物 +牲 +特 +牽 +犂 +犠 +犬 +犯 +状 +狂 +狄 +狐 +狗 +狙 +狛 +狡 +狩 +独 +狭 +狷 +狸 +狼 +猊 +猛 +猟 +猥 +猨 +猩 +猪 +猫 +献 +猴 +猶 +猷 +猾 +猿 +獄 +獅 +獏 +獣 +獲 +玄 +玅 +率 +玉 +王 +玖 +玩 +玲 +珀 +珂 +珈 +珉 +珊 +珍 +珎 +珞 
+珠 +珣 +珥 +珪 +班 +現 +球 +理 +琉 +琢 +琥 +琦 +琮 +琲 +琳 +琴 +琵 +琶 +瑁 +瑋 +瑙 +瑚 +瑛 +瑜 +瑞 +瑠 +瑤 +瑩 +瑪 +瑳 +瑾 +璃 +璋 +璜 +璞 +璧 +璨 +環 +璵 +璽 +璿 +瓊 +瓔 +瓜 +瓢 +瓦 +瓶 +甍 +甑 +甕 +甘 +甚 +甞 +生 +産 +甥 +用 +甫 +田 +由 +甲 +申 +男 +町 +画 +界 +畏 +畑 +畔 +留 +畜 +畝 +畠 +畢 +略 +番 +異 +畳 +當 +畷 +畸 +畺 +畿 +疆 +疇 +疋 +疎 +疏 +疑 +疫 +疱 +疲 +疹 +疼 +疾 +病 +症 +痒 +痔 +痕 +痘 +痙 +痛 +痢 +痩 +痴 +痺 +瘍 +瘡 +瘧 +療 +癇 +癌 +癒 +癖 +癡 +癪 +発 +登 +白 +百 +的 +皆 +皇 +皋 +皐 +皓 +皮 +皺 +皿 +盂 +盃 +盆 +盈 +益 +盒 +盗 +盛 +盞 +盟 +盡 +監 +盤 +盥 +盧 +目 +盲 +直 +相 +盾 +省 +眉 +看 +県 +眞 +真 +眠 +眷 +眺 +眼 +着 +睡 +督 +睦 +睨 +睿 +瞋 +瞑 +瞞 +瞬 +瞭 +瞰 +瞳 +瞻 +瞼 +瞿 +矍 +矛 +矜 +矢 +知 +矧 +矩 +短 +矮 +矯 +石 +砂 +砌 +研 +砕 +砥 +砦 +砧 +砲 +破 +砺 +硝 +硫 +硬 +硯 +碁 +碇 +碌 +碑 +碓 +碕 +碗 +碣 +碧 +碩 +確 +碾 +磁 +磐 +磔 +磧 +磨 +磬 +磯 +礁 +礎 +礒 +礙 +礫 +礬 +示 +礼 +社 +祀 +祁 +祇 +祈 +祉 +祐 +祓 +祕 +祖 +祗 +祚 +祝 +神 +祟 +祠 +祢 +祥 +票 +祭 +祷 +祺 +禁 +禄 +禅 +禊 +禍 +禎 +福 +禔 +禖 +禛 +禦 +禧 +禮 +禰 +禹 +禽 +禿 +秀 +私 +秋 +科 +秒 +秘 +租 +秤 +秦 +秩 +称 +移 +稀 +程 +税 +稔 +稗 +稙 +稚 +稜 +稠 +種 +稱 +稲 +稷 +稻 +稼 +稽 +稿 +穀 +穂 +穆 +積 +穎 +穏 +穗 +穜 +穢 +穣 +穫 +穴 +究 +空 +突 +窃 +窄 +窒 +窓 +窟 +窠 +窩 +窪 +窮 +窯 +竃 +竄 +竈 +立 +站 +竜 +竝 +竟 +章 +童 +竪 +竭 +端 +竴 +競 +竹 +竺 +竽 +竿 +笄 +笈 +笏 +笑 +笙 +笛 +笞 +笠 +笥 +符 +第 +笹 +筅 +筆 +筇 +筈 +等 +筋 +筌 +筍 +筏 +筐 +筑 +筒 +答 +策 +筝 +筥 +筧 +筬 +筮 +筯 +筰 +筵 +箆 +箇 +箋 +箏 +箒 +箔 +箕 +算 +箙 +箜 +管 +箪 +箭 +箱 +箸 +節 +篁 +範 +篆 +篇 +築 +篋 +篌 +篝 +篠 +篤 +篥 +篦 +篩 +篭 +篳 +篷 +簀 +簒 +簡 +簧 +簪 +簫 +簺 +簾 +簿 +籀 +籃 +籌 +籍 +籐 +籟 +籠 +籤 +籬 +米 +籾 +粂 +粉 +粋 +粒 +粕 +粗 +粘 +粛 +粟 +粥 +粧 +粮 +粳 +精 +糊 +糖 +糜 +糞 +糟 +糠 +糧 +糯 +糸 +糺 +系 +糾 +紀 +約 +紅 +紋 +納 +紐 +純 +紗 +紘 +紙 +級 +紛 +素 +紡 +索 +紫 +紬 +累 +細 +紳 +紵 +紹 +紺 +絁 +終 +絃 +組 +絅 +経 +結 +絖 +絞 +絡 +絣 +給 +統 +絲 +絵 +絶 +絹 +絽 +綏 +經 +継 +続 +綜 +綟 +綬 +維 +綱 +網 +綴 +綸 +綺 +綽 +綾 +綿 +緊 +緋 +総 +緑 +緒 +線 +締 +緥 +編 +緩 +緬 +緯 +練 +緻 +縁 +縄 +縅 +縒 +縛 +縞 +縢 +縣 +縦 +縫 +縮 +縹 +總 +績 +繁 +繊 +繋 +繍 +織 +繕 +繝 +繦 +繧 +繰 +繹 +繼 +纂 +纈 +纏 +纐 +纒 +纛 +缶 +罔 +罠 +罧 +罪 +置 +罰 +署 +罵 +罷 +罹 +羂 +羅 +羆 +羇 +羈 +羊 +羌 +美 +群 +羨 +義 +羯 +羲 +羹 +羽 +翁 +翅 +翌 +習 +翔 +翛 +翠 +翡 +翫 +翰 +翺 +翻 +翼 +耀 +老 +考 +者 +耆 +而 +耐 +耕 +耗 +耨 +耳 +耶 +耽 +聊 +聖 +聘 +聚 +聞 +聟 +聡 +聨 +聯 +聰 +聲 +聴 +職 +聾 +肄 +肆 +肇 +肉 +肋 +肌 +肖 +肘 +肛 +肝 +股 +肢 +肥 +肩 +肪 +肯 +肱 +育 +肴 +肺 +胃 +胆 +背 +胎 +胖 +胚 +胝 +胞 +胡 +胤 +胱 +胴 +胸 +能 +脂 +脅 +脆 +脇 +脈 +脊 +脚 +脛 +脩 +脱 +脳 
+腋 +腎 +腐 +腑 +腔 +腕 +腫 +腰 +腱 +腸 +腹 +腺 +腿 +膀 +膏 +膚 +膜 +膝 +膠 +膣 +膨 +膩 +膳 +膵 +膾 +膿 +臂 +臆 +臈 +臍 +臓 +臘 +臚 +臣 +臥 +臨 +自 +臭 +至 +致 +臺 +臼 +舂 +舅 +與 +興 +舌 +舍 +舎 +舒 +舖 +舗 +舘 +舜 +舞 +舟 +舩 +航 +般 +舳 +舶 +船 +艇 +艘 +艦 +艮 +良 +色 +艶 +芋 +芒 +芙 +芝 +芥 +芦 +芬 +芭 +芯 +花 +芳 +芸 +芹 +芻 +芽 +芿 +苅 +苑 +苔 +苗 +苛 +苞 +苡 +若 +苦 +苧 +苫 +英 +苴 +苻 +茂 +范 +茄 +茅 +茎 +茗 +茘 +茜 +茨 +茲 +茵 +茶 +茸 +茹 +草 +荊 +荏 +荒 +荘 +荷 +荻 +荼 +莞 +莪 +莫 +莬 +莱 +莵 +莽 +菅 +菊 +菌 +菓 +菖 +菘 +菜 +菟 +菩 +菫 +華 +菱 +菴 +萄 +萊 +萌 +萍 +萎 +萠 +萩 +萬 +萱 +落 +葉 +著 +葛 +葡 +董 +葦 +葩 +葬 +葭 +葱 +葵 +葺 +蒋 +蒐 +蒔 +蒙 +蒟 +蒡 +蒲 +蒸 +蒻 +蒼 +蒿 +蓄 +蓆 +蓉 +蓋 +蓑 +蓬 +蓮 +蓼 +蔀 +蔑 +蔓 +蔚 +蔡 +蔦 +蔬 +蔭 +蔵 +蔽 +蕃 +蕉 +蕊 +蕎 +蕨 +蕩 +蕪 +蕭 +蕾 +薄 +薇 +薊 +薔 +薗 +薙 +薛 +薦 +薨 +薩 +薪 +薫 +薬 +薭 +薮 +藁 +藉 +藍 +藏 +藐 +藝 +藤 +藩 +藪 +藷 +藹 +藺 +藻 +蘂 +蘆 +蘇 +蘊 +蘭 +虎 +虐 +虔 +虚 +虜 +虞 +號 +虫 +虹 +虻 +蚊 +蚕 +蛇 +蛉 +蛍 +蛎 +蛙 +蛛 +蛟 +蛤 +蛭 +蛮 +蛸 +蛹 +蛾 +蜀 +蜂 +蜃 +蜆 +蜊 +蜘 +蜜 +蜷 +蜻 +蝉 +蝋 +蝕 +蝙 +蝠 +蝦 +蝶 +蝿 +螂 +融 +螣 +螺 +蟄 +蟇 +蟠 +蟷 +蟹 +蟻 +蠢 +蠣 +血 +衆 +行 +衍 +衒 +術 +街 +衙 +衛 +衝 +衞 +衡 +衢 +衣 +表 +衫 +衰 +衵 +衷 +衽 +衾 +衿 +袁 +袈 +袋 +袍 +袒 +袖 +袙 +袞 +袢 +被 +袰 +袱 +袴 +袷 +袿 +裁 +裂 +裃 +装 +裏 +裔 +裕 +裘 +裙 +補 +裟 +裡 +裲 +裳 +裴 +裸 +裹 +製 +裾 +褂 +褄 +複 +褌 +褐 +褒 +褥 +褪 +褶 +褻 +襄 +襖 +襞 +襟 +襠 +襦 +襪 +襲 +襴 +襷 +西 +要 +覆 +覇 +覈 +見 +規 +視 +覗 +覚 +覧 +親 +覲 +観 +覺 +觀 +角 +解 +触 +言 +訂 +計 +討 +訓 +託 +記 +訛 +訟 +訢 +訥 +訪 +設 +許 +訳 +訴 +訶 +診 +註 +証 +詐 +詔 +評 +詛 +詞 +詠 +詢 +詣 +試 +詩 +詫 +詮 +詰 +話 +該 +詳 +誄 +誅 +誇 +誉 +誌 +認 +誓 +誕 +誘 +語 +誠 +誡 +誣 +誤 +誥 +誦 +説 +読 +誰 +課 +誼 +誾 +調 +談 +請 +諌 +諍 +諏 +諒 +論 +諚 +諜 +諟 +諡 +諦 +諧 +諫 +諭 +諮 +諱 +諶 +諷 +諸 +諺 +諾 +謀 +謄 +謌 +謎 +謗 +謙 +謚 +講 +謝 +謡 +謫 +謬 +謹 +證 +識 +譚 +譛 +譜 +警 +譬 +譯 +議 +譲 +譴 +護 +讀 +讃 +讐 +讒 +谷 +谿 +豅 +豆 +豊 +豎 +豐 +豚 +象 +豪 +豫 +豹 +貌 +貝 +貞 +負 +財 +貢 +貧 +貨 +販 +貪 +貫 +責 +貯 +貰 +貴 +買 +貸 +費 +貼 +貿 +賀 +賁 +賂 +賃 +賄 +資 +賈 +賊 +賎 +賑 +賓 +賛 +賜 +賞 +賠 +賢 +賣 +賤 +賦 +質 +賭 +購 +賽 +贄 +贅 +贈 +贋 +贔 +贖 +赤 +赦 +走 +赴 +起 +超 +越 +趙 +趣 +足 +趺 +趾 +跋 +跏 +距 +跡 +跨 +跪 +路 +跳 +践 +踊 +踏 +踐 +踞 +踪 +踵 +蹄 +蹉 +蹊 +蹟 +蹲 +蹴 +躅 +躇 +躊 +躍 +躑 +躙 +躪 +身 +躬 +躯 +躰 +車 +軋 +軌 +軍 +軒 +軟 +転 +軸 +軻 +軽 +軾 +較 +載 +輌 +輔 +輜 +輝 +輦 +輩 +輪 +輯 +輸 +輿 +轄 +轍 +轟 +轢 +辛 +辞 +辟 +辥 +辦 +辨 +辰 +辱 +農 +辺 +辻 +込 +迂 +迅 +迎 +近 +返 +迢 +迦 +迪 +迫 +迭 +述 +迷 +迹 +追 +退 +送 +逃 +逅 +逆 +逍 +透 +逐 +逓 +途 +逕 +逗 +這 +通 +逝 
+逞 +速 +造 +逢 +連 +逮 +週 +進 +逸 +逼 +遁 +遂 +遅 +遇 +遊 +運 +遍 +過 +遐 +道 +達 +違 +遙 +遜 +遠 +遡 +遣 +遥 +適 +遭 +遮 +遯 +遵 +遷 +選 +遺 +遼 +避 +邀 +邁 +邂 +邃 +還 +邇 +邉 +邊 +邑 +那 +邦 +邨 +邪 +邯 +邵 +邸 +郁 +郊 +郎 +郡 +郢 +部 +郭 +郴 +郵 +郷 +都 +鄂 +鄙 +鄭 +鄰 +鄲 +酉 +酋 +酌 +配 +酎 +酒 +酔 +酢 +酥 +酪 +酬 +酵 +酷 +酸 +醍 +醐 +醒 +醗 +醜 +醤 +醪 +醵 +醸 +采 +釈 +釉 +釋 +里 +重 +野 +量 +釐 +金 +釘 +釜 +針 +釣 +釧 +釿 +鈍 +鈎 +鈐 +鈔 +鈞 +鈦 +鈴 +鈷 +鈸 +鈿 +鉄 +鉇 +鉉 +鉋 +鉛 +鉢 +鉤 +鉦 +鉱 +鉾 +銀 +銃 +銅 +銈 +銑 +銕 +銘 +銚 +銜 +銭 +鋏 +鋒 +鋤 +鋭 +鋲 +鋳 +鋸 +鋺 +鋼 +錆 +錍 +錐 +錘 +錠 +錣 +錦 +錫 +錬 +錯 +録 +錵 +鍋 +鍍 +鍑 +鍔 +鍛 +鍬 +鍮 +鍵 +鍼 +鍾 +鎌 +鎖 +鎗 +鎚 +鎧 +鎬 +鎮 +鎰 +鎹 +鏃 +鏑 +鏡 +鐃 +鐇 +鐐 +鐔 +鐘 +鐙 +鐚 +鐡 +鐵 +鐸 +鑁 +鑊 +鑑 +鑒 +鑚 +鑠 +鑢 +鑰 +鑵 +鑷 +鑼 +鑽 +鑿 +長 +門 +閃 +閇 +閉 +開 +閏 +閑 +間 +閔 +閘 +関 +閣 +閤 +閥 +閦 +閨 +閬 +閲 +閻 +閼 +閾 +闇 +闍 +闔 +闕 +闘 +關 +闡 +闢 +闥 +阜 +阪 +阮 +阯 +防 +阻 +阿 +陀 +陂 +附 +陌 +降 +限 +陛 +陞 +院 +陣 +除 +陥 +陪 +陬 +陰 +陳 +陵 +陶 +陸 +険 +陽 +隅 +隆 +隈 +隊 +隋 +階 +随 +隔 +際 +障 +隠 +隣 +隧 +隷 +隻 +隼 +雀 +雁 +雄 +雅 +集 +雇 +雉 +雊 +雋 +雌 +雍 +雑 +雖 +雙 +雛 +離 +難 +雨 +雪 +雫 +雰 +雲 +零 +雷 +雹 +電 +需 +震 +霊 +霍 +霖 +霜 +霞 +霧 +霰 +露 +靈 +青 +靖 +静 +靜 +非 +面 +革 +靫 +靭 +靱 +靴 +靺 +鞁 +鞄 +鞆 +鞋 +鞍 +鞏 +鞘 +鞠 +鞨 +鞭 +韋 +韓 +韜 +韮 +音 +韶 +韻 +響 +頁 +頂 +頃 +項 +順 +須 +頌 +預 +頑 +頒 +頓 +領 +頚 +頬 +頭 +頴 +頸 +頻 +頼 +顆 +題 +額 +顎 +顔 +顕 +顗 +願 +顛 +類 +顧 +顯 +風 +飛 +食 +飢 +飩 +飫 +飯 +飲 +飴 +飼 +飽 +飾 +餃 +餅 +餉 +養 +餌 +餐 +餓 +餘 +餝 +餡 +館 +饂 +饅 +饉 +饋 +饌 +饒 +饗 +首 +馗 +香 +馨 +馬 +馳 +馴 +駄 +駅 +駆 +駈 +駐 +駒 +駕 +駝 +駿 +騁 +騎 +騏 +騒 +験 +騙 +騨 +騰 +驕 +驚 +驛 +驢 +骨 +骸 +髄 +體 +高 +髙 +髢 +髪 +髭 +髮 +髷 +髻 +鬘 +鬚 +鬢 +鬨 +鬯 +鬱 +鬼 +魁 +魂 +魄 +魅 +魏 +魔 +魚 +魯 +鮎 +鮑 +鮒 +鮪 +鮫 +鮭 +鮮 +鯉 +鯔 +鯖 +鯛 +鯨 +鯰 +鯱 +鰐 +鰒 +鰭 +鰯 +鰰 +鰹 +鰻 +鱈 +鱒 +鱗 +鱧 +鳥 +鳩 +鳰 +鳳 +鳴 +鳶 +鴈 +鴉 +鴎 +鴛 +鴟 +鴦 +鴨 +鴫 +鴻 +鵄 +鵜 +鵞 +鵡 +鵬 +鵲 +鵺 +鶉 +鶏 +鶯 +鶴 +鷄 +鷙 +鷲 +鷹 +鷺 +鸚 +鸞 +鹸 +鹽 +鹿 +麁 +麒 +麓 +麗 +麝 +麞 +麟 +麦 +麩 +麹 +麺 +麻 +麾 +麿 +黄 +黌 +黍 +黒 +黙 +黛 +黠 +鼈 +鼉 +鼎 +鼓 +鼠 +鼻 +齊 +齋 +齟 +齢 +齬 +龍 +龕 +龗 +! +# +% +& +( +) ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +R +S +T +U +V +W +X +Z +a +c +d +e +f +h +i +j +k +l +m +n +o +p +r +s +t +u +y +z +~ +・ + diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..d506b691bd1a6c55299ad89a72cf3a69a2c879a9 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt @@ -0,0 +1,153 @@ +k +a +_ +i +m +g +/ +1 +2 +I +L +S +V +R +C +0 +v +l +6 +4 +8 +. +j +p +ಗ +ು +ಣ +ಪ +ಡ +ಿ +ಸ +ಲ +ಾ +ದ +್ +7 +5 +3 +ವ +ಷ +ಬ +ಹ +ೆ +9 +ಅ +ಳ +ನ +ರ +ಉ +ಕ +ಎ +ೇ +ಂ +ೈ +ೊ +ೀ +ಯ +ೋ +ತ +ಶ +ಭ +ಧ +ಚ +ಜ +ೂ +ಮ +ಒ +ೃ +ಥ +ಇ +ಟ +ಖ +ಆ +ಞ +ಫ +- +ಢ +ಊ +ಓ +ಐ +ಃ +ಘ +ಝ +ೌ +ಠ +ಛ +ಔ +ಏ +ಈ +ಋ +೨ +೦ +೧ +೮ +೯ +೪ +, +೫ +೭ +೩ +೬ +ಙ +s +c +e +n +w +o +u +t +d +E +A +T +B +Z +N +G +O +q +z +r +x +P +K +M +J +U +D +f +F +h +b +W +Y +y +H +X +Q +' +# +& +! +@ +$ +: +% +é +É +( +? ++ + diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..a13899f14dfe3bfc25b34904390c7b1e4ed8674b --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt @@ -0,0 +1,3688 @@ +! +" +# +$ +% +& +' +* ++ +- +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +© +° +² +½ +Á +Ä +Å +Ç +É +Í +Î +Ó +Ö +× +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ð +ñ +ò +ó +ô +õ +ö +ø +ú +û +ü +ý +ā +ă +ą +ć +Č +č +đ +ē +ė +ę +ě +ğ +ī +İ +ı +Ł +ł +ń +ň +ō +ř +Ş +ş +Š +š +ţ +ū +ź +ż +Ž +ž +Ș +ș +Α +Δ +α +λ +φ +Г +О +а +в +л +о +р +с +т +я +​ +’ +“ +” +→ +∇ +∼ +「 +」 +ア +カ +グ +ニ +ラ +ン +ㄱ +ㄴ +ㄷ +ㄸ +ㄹ +ㅂ +ㅅ +ㅆ +ㅇ +ㅈ +ㅊ +ㅋ +ㅌ +ㅎ +ㅓ +ㅜ +ㅣ +一 +丁 +七 +三 +上 +下 +不 +丑 +世 +丘 +丞 +中 +丸 +丹 +主 +乃 +久 +之 +乎 +乘 +九 +也 +乳 +乾 +事 +二 +云 +互 +五 +井 +亞 +亡 +交 +亥 +亨 +享 +京 +亭 +人 +仁 +今 +他 +仙 +代 +令 +以 +仰 +仲 +件 +任 +企 +伊 +伍 +伎 +伏 +伐 +休 +伯 +伴 +伸 +佃 +佈 +位 +低 +住 +佐 +何 +佛 +作 +使 +來 +供 +依 +侯 +侵 +侶 +便 +俗 +保 +俠 +信 +修 +俱 +俳 +倉 +個 +倍 +倒 +候 +借 +値 +倫 +倭 +假 +偈 +偉 +偏 +停 +偶 +傅 +傑 +傳 +傷 +傾 +像 +僞 +僥 +僧 +價 +儀 +儉 +儒 +優 +儼 +兀 +允 +元 +兆 +先 +光 +克 +兒 +入 +內 +全 +八 +公 +六 +共 +兵 +其 +具 +典 +兼 +再 +冠 +冥 +冶 +准 +凞 +凡 +凱 +出 +函 +刀 +分 +刊 +刑 +列 +初 +判 +別 +利 +到 +制 +券 +刺 +刻 +則 +前 +剛 +副 +創 +劃 +劑 +力 +功 +加 +劣 +助 +劫 +勇 +動 +務 +勝 +勢 +勳 +勸 +匈 +化 +北 +匠 +區 +十 +千 +午 +半 +卍 +卑 +卒 +卓 +南 +博 +卜 +占 +卦 +印 +危 +卵 +卷 +卽 +卿 +厄 +原 +厦 +去 +參 +又 +叉 +友 +反 +叔 +受 +口 +古 +句 +可 +台 +史 +右 +司 +各 +合 +吉 +同 +名 +后 +吏 +吐 +君 +吠 +吳 +呂 +告 +周 +味 +呵 +命 +和 +咳 +咸 +咽 +哀 +品 +哨 +哮 +哲 +唐 +唯 +唱 +商 +問 +啼 +善 +喆 +喉 +喜 +喩 +喪 +嘗 +器 +嚴 +囊 +四 +回 +因 +困 +固 +圈 +國 +圍 +園 +圓 +圖 +團 +土 +在 +地 +均 +坊 +坐 +坑 +坵 +型 +垢 +城 +域 +埴 +執 +培 +基 +堂 +堅 +堆 +堤 +堯 +報 +場 +塔 +塚 +塞 +塵 +境 +墜 +墟 +墨 +墳 +墾 +壁 +壇 +壓 +壤 +士 +壬 +壯 +壺 +壽 +夏 +夕 +外 +多 +夜 +夢 +大 +天 +太 +夫 +央 +失 +夷 +奄 +奇 +奉 +奎 +奏 +契 +奔 +奮 +女 +奴 +好 +如 +妄 +妊 +妖 +妙 +始 +姑 +姓 +姚 +姜 +威 +婆 +婚 +婦 +媒 +媚 +子 +孔 +字 +存 +孝 +孟 +季 +孤 +孫 +學 +孺 +宇 +守 +安 +宋 +宗 +官 +宙 +定 +客 +宣 +室 +宮 +害 +家 +容 +寂 +寃 +寄 +寅 +密 +寇 +富 +寒 +寓 +實 +審 +寫 +寬 +寶 +寸 +寺 +封 +將 +專 +尊 +對 +小 +少 +尙 +尹 +尼 +尿 +局 +居 +屈 +屋 +屍 +屎 +屛 +層 +屬 +山 +岐 +岡 +岩 +岳 +岸 +峙 +峰 +島 +峻 +峽 +崇 +崔 +崖 +崩 +嶋 +巖 +川 +州 +巢 +工 +左 +巧 +巨 +巫 +差 +己 +巷 +市 +布 +帝 +師 +帶 +常 +帽 +幕 +干 +平 +年 +幹 +幻 +幼 +幽 +庇 +序 +店 +府 +度 +座 +庫 +庭 +康 +廟 +廣 +廳 +延 +廷 +建 +廻 +弁 +式 
+弑 +弓 +引 +弘 +弟 +弱 +張 +强 +弼 +彌 +彛 +形 +彬 +影 +役 +彼 +彿 +往 +征 +待 +律 +後 +徐 +徑 +得 +從 +循 +微 +德 +徹 +心 +必 +忌 +忍 +志 +忠 +思 +怡 +急 +性 +恐 +恒 +恨 +恩 +悅 +悖 +患 +悲 +情 +惑 +惟 +惠 +惡 +想 +惺 +愁 +意 +愚 +愛 +感 +愼 +慈 +態 +慕 +慣 +慧 +慾 +憂 +憤 +憺 +應 +懸 +戎 +成 +我 +戟 +戮 +戰 +戴 +戶 +房 +所 +手 +才 +打 +批 +承 +技 +抄 +把 +抗 +抱 +抽 +拇 +拓 +拘 +拙 +拜 +拾 +持 +指 +捌 +捨 +捿 +授 +掌 +排 +接 +推 +提 +揚 +揭 +援 +損 +搗 +摩 +播 +操 +擒 +擔 +擘 +據 +擧 +攘 +攝 +攬 +支 +改 +攻 +放 +政 +故 +敍 +敎 +救 +敗 +散 +敬 +整 +數 +文 +斗 +料 +斛 +斜 +斧 +斯 +新 +斷 +方 +於 +施 +旋 +族 +旗 +日 +旨 +早 +旱 +昌 +明 +易 +昔 +星 +春 +昧 +昭 +是 +時 +晉 +晋 +晩 +普 +景 +晴 +晶 +智 +暈 +暑 +暗 +暘 +曉 +曜 +曠 +曦 +曰 +曲 +書 +曹 +曼 +曾 +最 +會 +月 +有 +朋 +服 +望 +朝 +期 +木 +未 +末 +本 +朱 +朴 +李 +材 +村 +杖 +杜 +杞 +杭 +杯 +東 +松 +板 +林 +果 +枝 +枯 +枰 +枾 +柏 +柑 +柱 +栗 +校 +栢 +核 +根 +格 +桀 +桂 +案 +桎 +桑 +桓 +桔 +梁 +梏 +梓 +梗 +條 +梨 +梵 +棗 +棟 +森 +植 +椒 +楊 +楓 +楚 +業 +楮 +極 +榮 +槃 +槍 +樂 +樓 +樗 +樣 +樸 +樹 +樺 +樽 +橄 +橋 +橘 +機 +橡 +檀 +檎 +權 +欌 +欖 +次 +欲 +歌 +歐 +止 +正 +此 +步 +武 +歲 +歸 +死 +殖 +段 +殷 +殺 +殿 +毅 +母 +毒 +比 +毛 +氏 +民 +氣 +水 +永 +求 +汎 +汗 +江 +池 +沅 +沒 +沖 +沙 +沛 +河 +油 +治 +沼 +沿 +泉 +泊 +法 +泗 +泡 +波 +注 +泰 +洋 +洙 +洛 +洞 +津 +洲 +活 +派 +流 +浅 +浦 +浮 +浴 +海 +涅 +涇 +消 +涌 +液 +淑 +淡 +淨 +淫 +深 +淳 +淵 +淸 +渠 +渡 +游 +渾 +湖 +湯 +源 +溪 +溫 +溶 +滄 +滅 +滋 +滯 +滿 +漁 +漆 +漢 +漫 +漸 +潑 +潤 +潭 +澄 +澎 +澤 +澳 +澹 +濁 +濕 +濟 +濤 +濯 +瀋 +瀝 +灣 +火 +灰 +灸 +災 +炎 +炭 +点 +烈 +烏 +烙 +焚 +無 +焦 +然 +煌 +煎 +照 +煬 +煮 +熟 +熱 +燁 +燈 +燔 +燕 +燥 +燧 +燮 +爲 +爵 +父 +片 +版 +牌 +牛 +牝 +牟 +牡 +物 +特 +犧 +犬 +狀 +狗 +猥 +猩 +猪 +獨 +獵 +獸 +獻 +玄 +玉 +王 +玲 +珍 +珠 +珪 +班 +現 +球 +理 +琴 +瑞 +瑟 +瑪 +璃 +璋 +璽 +瓜 +瓦 +甑 +甘 +生 +産 +用 +甫 +田 +由 +甲 +申 +男 +界 +畏 +留 +畜 +畢 +略 +番 +異 +畵 +當 +畸 +疏 +疑 +疫 +疹 +疼 +病 +症 +痔 +痛 +痺 +瘀 +瘍 +瘡 +療 +癌 +癖 +登 +發 +白 +百 +的 +皆 +皇 +皮 +盂 +盆 +益 +盛 +盜 +盟 +盡 +盤 +盧 +目 +直 +相 +省 +看 +眞 +眼 +睡 +督 +瞋 +矢 +矣 +知 +短 +石 +破 +碍 +碑 +磁 +磨 +磬 +示 +社 +祇 +祖 +祝 +神 +祥 +祭 +祺 +禁 +禅 +禍 +福 +禦 +禪 +禮 +禹 +禽 +禾 +秀 +私 +秉 +秋 +科 +秘 +秤 +秦 +秩 +移 +稀 +稗 +種 +稱 +稷 +稼 +稽 +穀 +穆 +積 +空 +窮 +竅 +立 +章 +童 +竭 +端 +竹 +笑 +符 +第 +筆 +等 +筍 +答 +策 +箋 +箕 +管 +箱 +節 +篇 +簡 +米 +粉 +粘 +粥 +精 +糖 +糞 +系 +紀 +紂 +約 +紅 +紋 +純 +紙 +級 +素 +索 +紫 +紬 +累 +細 +紳 +終 +組 +結 +絡 +統 +絲 +絶 +絹 +經 +綠 +維 +綱 +網 +綸 +綽 +緖 +線 +緣 +緯 +縣 +縱 +總 +織 +繡 +繩 +繪 +繭 +纂 +續 +罕 +置 +罰 +羅 +羊 +美 +群 +義 +羽 +翁 +習 +翟 +老 
+考 +者 +而 +耐 +耕 +耳 +聃 +聖 +聞 +聰 +聲 +職 +肇 +肉 +肖 +肝 +股 +肥 +育 +肺 +胃 +胎 +胚 +胞 +胡 +胥 +能 +脂 +脈 +脚 +脛 +脣 +脩 +脫 +脯 +脾 +腋 +腎 +腫 +腸 +腹 +膜 +膠 +膨 +膽 +臆 +臟 +臣 +臥 +臨 +自 +至 +致 +臺 +臼 +臾 +與 +興 +舊 +舌 +舍 +舒 +舜 +舟 +般 +船 +艦 +良 +色 +芋 +花 +芳 +芽 +苑 +苔 +苕 +苛 +苞 +若 +苦 +英 +茂 +茵 +茶 +茹 +荀 +荇 +草 +荒 +荷 +莊 +莫 +菊 +菌 +菜 +菩 +菫 +華 +菴 +菽 +萊 +萍 +萬 +落 +葉 +著 +葛 +董 +葬 +蒙 +蒜 +蒲 +蒸 +蒿 +蓮 +蔓 +蔘 +蔡 +蔬 +蕃 +蕉 +蕓 +薄 +薑 +薛 +薩 +薪 +薺 +藏 +藝 +藤 +藥 +藩 +藻 +蘆 +蘇 +蘊 +蘚 +蘭 +虎 +處 +虛 +虞 +虹 +蜀 +蜂 +蜜 +蝕 +蝶 +融 +蟬 +蟲 +蠶 +蠻 +血 +衆 +行 +術 +衛 +衡 +衣 +表 +袁 +裔 +裕 +裙 +補 +製 +複 +襄 +西 +要 +見 +視 +親 +覺 +觀 +角 +解 +言 +訂 +訊 +訓 +託 +記 +訣 +設 +診 +註 +評 +詩 +話 +詵 +誅 +誌 +認 +誕 +語 +誠 +誤 +誥 +誦 +說 +調 +談 +諍 +論 +諡 +諫 +諭 +諸 +謙 +講 +謝 +謠 +證 +識 +譚 +譜 +譯 +議 +護 +讀 +變 +谷 +豆 +豊 +豚 +象 +豪 +豫 +貝 +貞 +財 +貧 +貨 +貪 +貫 +貴 +貸 +費 +資 +賊 +賓 +賞 +賢 +賣 +賦 +質 +贍 +赤 +赫 +走 +起 +超 +越 +趙 +趣 +趨 +足 +趾 +跋 +跡 +路 +踏 +蹟 +身 +躬 +車 +軍 +軒 +軟 +載 +輓 +輕 +輪 +輯 +輸 +輻 +輿 +轅 +轉 +辨 +辭 +辯 +辰 +農 +近 +迦 +述 +追 +逆 +透 +逐 +通 +逝 +造 +逢 +連 +進 +逵 +遂 +遊 +運 +遍 +過 +道 +達 +遠 +遡 +適 +遷 +選 +遺 +遽 +還 +邊 +邑 +那 +邪 +郞 +郡 +部 +都 +鄒 +鄕 +鄭 +鄲 +配 +酒 +酸 +醉 +醫 +醯 +釋 +里 +重 +野 +量 +釐 +金 +針 +鈍 +鈴 +鉞 +銀 +銅 +銘 +鋼 +錄 +錢 +錦 +鎭 +鏡 +鐘 +鐵 +鑑 +鑛 +長 +門 +閃 +開 +間 +閔 +閣 +閥 +閭 +閻 +闕 +關 +阪 +防 +阿 +陀 +降 +限 +陝 +院 +陰 +陳 +陵 +陶 +陸 +陽 +隆 +隊 +隋 +階 +際 +障 +隣 +隨 +隱 +隷 +雀 +雄 +雅 +集 +雇 +雌 +雖 +雙 +雜 +離 +難 +雨 +雪 +雲 +電 +霜 +露 +靈 +靑 +靖 +靜 +非 +面 +革 +靴 +鞏 +韓 +音 +韶 +韻 +順 +須 +頊 +頌 +領 +頭 +顔 +願 +顚 +類 +顯 +風 +飛 +食 +飢 +飮 +飯 +飾 +養 +餓 +餘 +首 +香 +馨 +馬 +駒 +騫 +騷 +驕 +骨 +骸 +髓 +體 +高 +髥 +髮 +鬪 +鬱 +鬼 +魏 +魔 +魚 +魯 +鮮 +鰍 +鰐 +鳥 +鳧 +鳳 +鴨 +鵲 +鶴 +鷄 +鷹 +鹽 +鹿 +麗 +麥 +麻 +黃 +黑 +默 +點 +黨 +鼎 +齊 +齋 +齒 +龍 +龜 +가 +각 +간 +갇 +갈 +갉 +감 +갑 +값 +갓 +갔 +강 +갖 +갗 +같 +갚 +갛 +개 +객 +갠 +갤 +갬 +갭 +갯 +갰 +갱 +갸 +걀 +걔 +걘 +거 +걱 +건 +걷 +걸 +검 +겁 +것 +겄 +겅 +겆 +겉 +겊 +겋 +게 +겐 +겔 +겟 +겠 +겡 +겨 +격 +겪 +견 +결 +겸 +겹 +겻 +겼 +경 +곁 +계 +곕 +곗 +고 +곡 +곤 +곧 +골 +곪 +곬 +곯 +곰 +곱 +곳 +공 +곶 +과 +곽 +관 +괄 +괌 +광 +괘 +괜 +괭 +괴 +괸 +굉 +교 +구 +국 +군 +굳 +굴 +굵 +굶 +굼 +굽 +굿 +궁 +궂 +궈 +권 +궐 +궜 +궝 +궤 +귀 +귄 +귈 +귓 +규 +균 +귤 +그 +극 +근 +글 +긁 +금 +급 +긋 +긍 +기 +긴 +길 +김 +깁 +깃 +깅 +깊 +까 +깍 +깎 +깐 +깔 +깜 +깝 +깟 +깡 +깥 +깨 +깬 +깰 +깻 +깼 +깽 +꺄 +꺼 +꺽 +꺾 +껀 +껄 +껌 +껍 +껏 +껐 +껑 +께 +껴 +꼈 +꼍 +꼐 +꼬 +꼭 +꼴 +꼼 +꼽 +꼿 +꽁 +꽂 
+꽃 +꽉 +꽝 +꽤 +꽥 +꾀 +꾜 +꾸 +꾹 +꾼 +꿀 +꿇 +꿈 +꿉 +꿋 +꿍 +꿎 +꿔 +꿨 +꿩 +꿰 +꿴 +뀄 +뀌 +뀐 +뀔 +뀜 +뀝 +끄 +끈 +끊 +끌 +끓 +끔 +끕 +끗 +끙 +끝 +끼 +끽 +낀 +낄 +낌 +낍 +낏 +낑 +나 +낙 +낚 +난 +낟 +날 +낡 +남 +납 +낫 +났 +낭 +낮 +낯 +낱 +낳 +내 +낵 +낸 +낼 +냄 +냅 +냇 +냈 +냉 +냐 +냔 +냘 +냥 +너 +넉 +넋 +넌 +널 +넓 +넘 +넙 +넛 +넜 +넝 +넣 +네 +넥 +넨 +넬 +넴 +넵 +넷 +넸 +넹 +녀 +녁 +년 +념 +녔 +녕 +녘 +녜 +노 +녹 +논 +놀 +놈 +놋 +농 +높 +놓 +놔 +놨 +뇌 +뇨 +뇩 +뇽 +누 +눅 +눈 +눌 +눔 +눕 +눗 +눠 +눴 +뉘 +뉜 +뉩 +뉴 +늄 +늅 +늉 +느 +늑 +는 +늘 +늙 +늠 +늡 +능 +늦 +늪 +늬 +니 +닉 +닌 +닐 +님 +닙 +닛 +닝 +닢 +다 +닥 +닦 +단 +닫 +달 +닭 +닮 +닯 +닳 +담 +답 +닷 +당 +닻 +닿 +대 +댁 +댄 +댈 +댐 +댑 +댓 +댔 +댕 +댜 +더 +덕 +덖 +던 +덜 +덟 +덤 +덥 +덧 +덩 +덫 +덮 +데 +덱 +덴 +델 +뎀 +뎃 +뎅 +뎌 +뎠 +뎨 +도 +독 +돈 +돋 +돌 +돔 +돕 +돗 +동 +돛 +돝 +돼 +됐 +되 +된 +될 +됨 +됩 +됴 +두 +둑 +둔 +둘 +둠 +둡 +둣 +둥 +둬 +뒀 +뒤 +뒬 +뒷 +뒹 +듀 +듈 +듐 +드 +득 +든 +듣 +들 +듦 +듬 +듭 +듯 +등 +듸 +디 +딕 +딘 +딛 +딜 +딤 +딥 +딧 +딨 +딩 +딪 +따 +딱 +딴 +딸 +땀 +땄 +땅 +때 +땐 +땔 +땜 +땝 +땠 +땡 +떠 +떡 +떤 +떨 +떫 +떰 +떱 +떳 +떴 +떵 +떻 +떼 +떽 +뗀 +뗄 +뗍 +뗏 +뗐 +뗑 +또 +똑 +똘 +똥 +뙤 +뚜 +뚝 +뚤 +뚫 +뚱 +뛰 +뛴 +뛸 +뜀 +뜁 +뜨 +뜩 +뜬 +뜯 +뜰 +뜸 +뜻 +띄 +띈 +띌 +띔 +띕 +띠 +띤 +띨 +띱 +띵 +라 +락 +란 +랄 +람 +랍 +랏 +랐 +랑 +랒 +랗 +래 +랙 +랜 +랠 +램 +랩 +랫 +랬 +랭 +랴 +략 +량 +러 +럭 +런 +럴 +럼 +럽 +럿 +렀 +렁 +렇 +레 +렉 +렌 +렐 +렘 +렙 +렛 +렝 +려 +력 +련 +렬 +렴 +렵 +렷 +렸 +령 +례 +로 +록 +론 +롤 +롬 +롭 +롯 +롱 +롸 +롹 +뢰 +뢴 +뢸 +룃 +료 +룐 +룡 +루 +룩 +룬 +룰 +룸 +룹 +룻 +룽 +뤄 +뤘 +뤼 +류 +륙 +륜 +률 +륨 +륭 +르 +륵 +른 +를 +름 +릅 +릇 +릉 +릎 +리 +릭 +린 +릴 +림 +립 +릿 +링 +마 +막 +만 +많 +맏 +말 +맑 +맘 +맙 +맛 +망 +맞 +맡 +맣 +매 +맥 +맨 +맬 +맴 +맵 +맷 +맸 +맹 +맺 +먀 +먁 +머 +먹 +먼 +멀 +멈 +멋 +멍 +멎 +메 +멕 +멘 +멜 +멤 +멥 +멧 +멩 +며 +멱 +면 +멸 +몄 +명 +몇 +모 +목 +몫 +몬 +몰 +몸 +몹 +못 +몽 +뫼 +묘 +무 +묵 +묶 +문 +묻 +물 +묽 +뭄 +뭅 +뭇 +뭉 +뭍 +뭏 +뭐 +뭔 +뭘 +뭡 +뭣 +뮈 +뮌 +뮐 +뮤 +뮬 +므 +믈 +믐 +미 +믹 +민 +믿 +밀 +밈 +밉 +밋 +밌 +밍 +및 +밑 +바 +박 +밖 +반 +받 +발 +밝 +밟 +밤 +밥 +밧 +방 +밭 +배 +백 +밴 +밸 +뱀 +뱁 +뱃 +뱄 +뱅 +뱉 +뱍 +뱐 +버 +벅 +번 +벌 +범 +법 +벗 +벙 +벚 +베 +벡 +벤 +벨 +벰 +벱 +벳 +벵 +벼 +벽 +변 +별 +볍 +볏 +볐 +병 +볕 +보 +복 +볶 +본 +볼 +봄 +봅 +봇 +봉 +봐 +봤 +뵈 +뵐 +뵙 +부 +북 +분 +붇 +불 +붉 +붐 +붓 +붕 +붙 +뷔 +뷰 +뷴 +뷸 +브 +븐 +블 +비 +빅 +빈 +빌 +빔 +빕 +빗 +빙 +빚 +빛 +빠 +빡 +빤 +빨 +빳 +빴 +빵 +빻 +빼 +빽 +뺀 +뺄 +뺌 +뺏 +뺐 +뺑 +뺨 +뻐 +뻑 +뻔 +뻗 +뻘 +뻣 +뻤 +뻥 +뻬 +뼈 +뼉 +뼘 +뽀 +뽈 +뽐 +뽑 +뽕 +뾰 +뿌 +뿍 +뿐 +뿔 +뿜 +쁘 +쁜 +쁠 +쁨 +삐 +삔 +삘 +사 
+삭 +삯 +산 +살 +삵 +삶 +삼 +삽 +삿 +샀 +상 +샅 +새 +색 +샌 +샐 +샘 +샙 +샛 +샜 +생 +샤 +샨 +샬 +샴 +샵 +샷 +샹 +서 +석 +섞 +선 +섣 +설 +섬 +섭 +섯 +섰 +성 +섶 +세 +섹 +센 +셀 +셈 +셉 +셋 +셌 +셍 +셔 +션 +셜 +셨 +셰 +셴 +셸 +소 +속 +손 +솔 +솜 +솝 +솟 +송 +솥 +쇄 +쇠 +쇤 +쇳 +쇼 +숀 +숄 +숍 +수 +숙 +순 +숟 +술 +숨 +숩 +숫 +숭 +숯 +숱 +숲 +숴 +쉐 +쉘 +쉬 +쉭 +쉰 +쉴 +쉼 +쉽 +슈 +슐 +슘 +슛 +슝 +스 +슥 +슨 +슬 +슭 +슴 +습 +슷 +승 +시 +식 +신 +싣 +실 +싫 +심 +십 +싯 +싱 +싶 +싸 +싹 +싼 +쌀 +쌈 +쌉 +쌌 +쌍 +쌓 +쌔 +쌘 +쌩 +써 +썩 +썬 +썰 +썸 +썹 +썼 +썽 +쎄 +쎈 +쏘 +쏙 +쏜 +쏟 +쏠 +쏭 +쏴 +쐈 +쐐 +쐬 +쑤 +쑥 +쑨 +쒀 +쒔 +쓰 +쓱 +쓴 +쓸 +씀 +씁 +씌 +씨 +씩 +씬 +씰 +씸 +씹 +씻 +씽 +아 +악 +안 +앉 +않 +알 +앎 +앓 +암 +압 +앗 +았 +앙 +앞 +애 +액 +앤 +앨 +앰 +앱 +앳 +앴 +앵 +야 +약 +얀 +얄 +얇 +얌 +얍 +얏 +양 +얕 +얗 +얘 +얜 +어 +억 +언 +얹 +얻 +얼 +얽 +엄 +업 +없 +엇 +었 +엉 +엊 +엌 +엎 +에 +엑 +엔 +엘 +엠 +엡 +엣 +엥 +여 +역 +엮 +연 +열 +엷 +염 +엽 +엾 +엿 +였 +영 +옅 +옆 +옇 +예 +옌 +옐 +옙 +옛 +오 +옥 +온 +올 +옭 +옮 +옳 +옴 +옵 +옷 +옹 +옻 +와 +왁 +완 +왈 +왑 +왓 +왔 +왕 +왜 +왠 +왱 +외 +왼 +요 +욕 +욘 +욜 +욤 +용 +우 +욱 +운 +울 +움 +웁 +웃 +웅 +워 +웍 +원 +월 +웜 +웠 +웡 +웨 +웬 +웰 +웸 +웹 +위 +윅 +윈 +윌 +윔 +윗 +윙 +유 +육 +윤 +율 +윱 +윳 +융 +으 +윽 +은 +을 +읊 +음 +읍 +응 +의 +읜 +읠 +이 +익 +인 +일 +읽 +잃 +임 +입 +잇 +있 +잉 +잊 +잎 +자 +작 +잔 +잖 +잘 +잠 +잡 +잣 +잤 +장 +잦 +재 +잭 +잰 +잴 +잽 +잿 +쟀 +쟁 +쟈 +쟉 +쟤 +저 +적 +전 +절 +젊 +점 +접 +젓 +정 +젖 +제 +젝 +젠 +젤 +젬 +젭 +젯 +져 +젼 +졀 +졌 +졍 +조 +족 +존 +졸 +좀 +좁 +종 +좇 +좋 +좌 +좍 +좽 +죄 +죠 +죤 +주 +죽 +준 +줄 +줌 +줍 +줏 +중 +줘 +줬 +쥐 +쥔 +쥘 +쥬 +쥴 +즈 +즉 +즌 +즐 +즘 +즙 +증 +지 +직 +진 +짇 +질 +짊 +짐 +집 +짓 +징 +짖 +짙 +짚 +짜 +짝 +짠 +짢 +짤 +짧 +짬 +짭 +짰 +짱 +째 +짹 +짼 +쨀 +쨉 +쨋 +쨌 +쨍 +쩄 +쩌 +쩍 +쩐 +쩔 +쩜 +쩝 +쩡 +쩨 +쪄 +쪘 +쪼 +쪽 +쪾 +쫀 +쫄 +쫑 +쫓 +쫙 +쬐 +쭈 +쭉 +쭐 +쭙 +쯔 +쯤 +쯧 +찌 +찍 +찐 +찔 +찜 +찝 +찡 +찢 +찧 +차 +착 +찬 +찮 +찰 +참 +찹 +찻 +찼 +창 +찾 +채 +책 +챈 +챌 +챔 +챕 +챗 +챘 +챙 +챠 +챤 +처 +척 +천 +철 +첨 +첩 +첫 +청 +체 +첵 +첸 +첼 +쳄 +쳇 +쳉 +쳐 +쳔 +쳤 +초 +촉 +촌 +촘 +촛 +총 +촨 +촬 +최 +쵸 +추 +축 +춘 +출 +춤 +춥 +춧 +충 +춰 +췄 +췌 +취 +췬 +츄 +츠 +측 +츨 +츰 +층 +치 +칙 +친 +칠 +칡 +침 +칩 +칫 +칭 +카 +칵 +칸 +칼 +캄 +캅 +캇 +캉 +캐 +캔 +캘 +캠 +캡 +캣 +캤 +캥 +캬 +커 +컥 +컨 +컫 +컬 +컴 +컵 +컷 +컸 +컹 +케 +켄 +켈 +켐 +켓 +켕 +켜 +켠 +켤 +켭 +켯 +켰 +코 +콕 +콘 +콜 +콤 +콥 +콧 +콩 +콰 +콱 +콴 +콸 +쾅 +쾌 +쾡 +쾨 +쾰 +쿄 +쿠 +쿡 +쿤 +쿨 +쿰 +쿵 +쿼 +퀀 +퀄 +퀘 +퀭 +퀴 +퀵 +퀸 +퀼 +큐 +큘 +크 +큰 +클 +큼 +큽 +키 +킥 +킨 +킬 +킴 +킵 +킷 +킹 +타 +탁 +탄 +탈 +탉 +탐 +탑 +탓 +탔 +탕 +태 +택 +탠 
+탤 +탬 +탭 +탯 +탰 +탱 +터 +턱 +턴 +털 +텀 +텁 +텃 +텄 +텅 +테 +텍 +텐 +텔 +템 +텝 +텡 +텨 +톈 +토 +톡 +톤 +톨 +톰 +톱 +톳 +통 +퇴 +툇 +투 +툭 +툰 +툴 +툼 +퉁 +퉈 +퉜 +튀 +튄 +튈 +튕 +튜 +튠 +튤 +튬 +트 +특 +튼 +튿 +틀 +틈 +틉 +틋 +틔 +티 +틱 +틴 +틸 +팀 +팁 +팅 +파 +팍 +팎 +판 +팔 +팜 +팝 +팟 +팠 +팡 +팥 +패 +팩 +팬 +팰 +팸 +팻 +팼 +팽 +퍼 +퍽 +펀 +펄 +펌 +펍 +펐 +펑 +페 +펙 +펜 +펠 +펨 +펩 +펫 +펭 +펴 +편 +펼 +폄 +폈 +평 +폐 +포 +폭 +폰 +폴 +폼 +폿 +퐁 +표 +푭 +푸 +푹 +푼 +풀 +품 +풋 +풍 +퓨 +퓬 +퓰 +퓸 +프 +픈 +플 +픔 +픕 +피 +픽 +핀 +필 +핌 +핍 +핏 +핑 +하 +학 +한 +할 +핥 +함 +합 +핫 +항 +해 +핵 +핸 +핼 +햄 +햅 +햇 +했 +행 +햐 +향 +헀 +허 +헉 +헌 +헐 +험 +헙 +헛 +헝 +헤 +헥 +헨 +헬 +헴 +헵 +헷 +헹 +혀 +혁 +현 +혈 +혐 +협 +혓 +혔 +형 +혜 +호 +혹 +혼 +홀 +홈 +홉 +홋 +홍 +홑 +화 +확 +환 +활 +홧 +황 +홰 +홱 +횃 +회 +획 +횝 +횟 +횡 +효 +후 +훅 +훈 +훌 +훑 +훔 +훗 +훤 +훨 +훼 +휄 +휑 +휘 +휙 +휜 +휠 +휩 +휭 +휴 +휼 +흄 +흉 +흐 +흑 +흔 +흘 +흙 +흠 +흡 +흣 +흥 +흩 +희 +흰 +흽 +히 +힉 +힌 +힐 +힘 +힙 +힝 +車 +滑 +金 +奈 +羅 +洛 +卵 +欄 +蘭 +郎 +來 +盧 +老 +魯 +綠 +鹿 +論 +雷 +樓 +縷 +凌 +樂 +不 +參 +葉 +沈 +若 +兩 +凉 +梁 +呂 +女 +廬 +麗 +黎 +曆 +歷 +戀 +蓮 +連 +列 +烈 +裂 +念 +獵 +靈 +領 +例 +禮 +醴 +惡 +尿 +料 +遼 +龍 +暈 +柳 +流 +類 +六 +陸 +倫 +律 +栗 +利 +李 +梨 +理 +離 +燐 +林 +臨 +立 +茶 +切 +宅 + diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..e166bf33ecfbdc90ddb3d9743fded23306acabd5 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt @@ -0,0 +1,185 @@ + +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +} +¡ +£ +§ +ª +« +­ +° +² +³ +´ +µ +· +º +» +¿ +À +Á + +Ä +Å +Ç +È +É +Ê +Ë +Ì +Í +Î +Ï +Ò +Ó +Ô +Õ +Ö +Ú +Ü +Ý +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ñ +ò +ó +ô +õ +ö +ø +ù +ú +û +ü +ý +ą +Ć +ć +Č +č +Đ +đ +ę +ı +Ł +ł +ō +Œ +œ +Š +š +Ÿ +Ž +ž +ʒ +β +δ +ε +з +Ṡ +‘ +€ +™ diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt new file mode 100644 index 0000000000000000000000000000000000000000..84b885d8352226e49b1d5d791b8f43a663e246aa --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! 
+姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. +鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 
+哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 +表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 +扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 
+几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 +邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 +昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 
+於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? +郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 
+裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 +磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 
+隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 +炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 +昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 
+掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 +桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 +胄 +疃 
+薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 +怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 
+疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 +視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 +閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 
+鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. +f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 \ No newline at end of file diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..09e275bae943431ae75f583b9f4519c96161eb85 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv4_doc_dict.txt @@ -0,0 +1,15629 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 
+消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! +姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. 
+鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 +哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 
+表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 +扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 +几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 +邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 
+昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 +於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? 
+郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 +裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 
+磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 +隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 +炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 
+昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 +掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 
+桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 +胄 +疃 +薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 
+怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 +疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 
+視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 +閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 +鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. 
+f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮! +䰾 +䲁 +丌 +丏 +丟 +並 +乂 +乗 +乩 +乭 +乹 +亀 +亂 +亅 +亊 +亠 +亰 +亶 +亹 +仂 +仉 +仏 +仛 +仫 +仮 +仳 +仵 +仼 +伃 +伋 +伕 +伝 +伷 +伾 +佀 +佁 +佇 +佈 +佉 +佋 +佔 +併 +佹 +佺 +佾 +侁 +侅 +侊 +侖 +侘 +侚 +侞 +価 +侶 +侷 +侹 +俁 +俅 +俋 +俌 +俍 +俛 +俠 +俳 +俴 +俶 +俽 +倈 +倓 +倖 +倗 +倞 +倢 +倣 +値 +倧 +倮 +倻 +偁 +偊 +偍 +偓 +偪 +偲 +側 +偵 +偸 +傃 +傉 +傑 +傒 +傕 +傖 +傜 +傭 +債 +傾 +僅 +僉 +僊 +働 +僔 +僕 +僖 +僙 +僜 +僡 +僩 +僭 +僰 +僱 +僴 +儁 +儂 +儆 +儇 +儈 +儉 +儐 +儔 +儕 +儘 +儚 +儞 +償 +儦 +儫 +儲 +儷 +儺 +儻 +儼 +兌 +児 +兕 +兗 +兪 +冂 +円 +冇 +冊 +冑 +冖 +冧 +冨 +冪 +冫 +冴 +凃 +凜 +凞 +凪 +凵 +刄 +刎 +別 +刦 +刧 +刼 +則 +剋 +剏 +剝 +剣 +剮 +劄 +劊 +劌 +劔 +劬 +効 +劼 +勔 +勖 +勗 +勛 +勞 +勣 +勦 +勱 +勲 +勳 +勵 +勷 +勸 +勻 +匂 +匄 +匏 +匚 +匱 +匸 +卋 +卍 +卐 +卣 +卬 +卮 +卲 +卹 +卺 +卻 +卽 +厓 +厔 +厙 +厭 +厰 +厲 +厴 +厶 +叄 +収 +叕 +叡 +叵 +吔 +吥 +吳 +吶 +呂 +呉 +呎 +呾 +咁 +咑 +咗 +咘 +咟 +咥 +咲 +咼 +咾 +哂 +哏 +哐 +哖 +哱 +唃 +唄 +唫 +唭 +唵 +唸 +啁 +啍 +啚 +啞 +啣 +啯 +啱 +啲 +啷 +喈 +喚 +喢 +喦 +喪 +喲 +喼 +嗄 +嗆 +嗇 +嗊 +嗎 +嗚 +嗢 +嗩 +嗶 +嗹 +嘅 +嘆 +嘍 +嘏 +嘔 +嘗 +嘚 +嘜 +嘥 +嘩 +嘮 +嘯 +嘰 +嘸 +噍 +噏 +噓 +噝 +噠 +噥 +噦 +噯 +噰 +噲 +噴 +噸 +噹 +嚇 +嚈 +嚐 +嚕 +嚗 +嚙 +嚟 +嚤 +嚦 +嚧 +嚨 +嚩 +嚮 +嚳 +嚶 +嚿 +囀 +囂 +囃 +囉 +囑 +囒 +囓 +囝 +団 +囧 +囪 +囮 +囯 +囲 +図 +囶 +囷 +圂 +圄 +圉 +圏 +圓 +圪 +圯 +坌 +坖 +坣 +坬 +坮 +坵 +垈 +垍 +垕 +垞 +垯 +垰 +垵 +垻 +垿 +埅 +埇 +埈 +埏 +埒 +埜 +埡 +埤 +埧 +埨 +埪 +埮 +埴 +埵 +埻 +埼 +堅 +堈 +堉 +堊 +堍 +堖 
+堝 +堦 +堮 +堯 +堺 +塀 +塅 +塆 +塋 +塏 +塙 +塜 +塡 +塢 +塤 +塨 +塩 +塭 +塰 +塱 +塲 +塵 +塹 +塽 +墀 +墎 +増 +墘 +墜 +墡 +墣 +墫 +墬 +墮 +墱 +墳 +墺 +墼 +墾 +壄 +壆 +壋 +壌 +壎 +壔 +壘 +壙 +壞 +壟 +壠 +壢 +壩 +壯 +壱 +壺 +変 +夊 +夠 +夤 +夾 +奀 +奐 +奣 +奩 +奫 +奭 +奮 +妀 +妁 +妏 +妑 +妠 +妧 +妭 +妸 +妺 +姀 +姁 +姃 +姈 +姉 +姍 +姦 +姪 +姫 +姮 +姵 +姶 +姸 +娋 +娍 +娎 +娖 +娛 +娫 +娳 +娸 +婁 +婑 +婯 +婻 +婼 +媃 +媊 +媐 +媓 +媖 +媗 +媜 +媞 +媧 +媭 +媯 +媺 +媼 +媿 +嫄 +嫈 +嫘 +嫪 +嫲 +嫳 +嫵 +嫺 +嫻 +嬅 +嬈 +嬋 +嬌 +嬛 +嬝 +嬡 +嬤 +嬨 +嬪 +嬬 +嬭 +嬸 +嬾 +嬿 +孀 +孆 +孋 +孌 +孮 +孻 +孿 +宍 +実 +宧 +宮 +寀 +寁 +寈 +寊 +寔 +寖 +寗 +寘 +寛 +寜 +寢 +審 +寯 +尋 +尗 +尢 +尪 +屄 +屇 +屍 +屓 +屚 +屜 +屢 +屬 +屭 +屺 +屻 +岀 +岈 +岡 +岣 +岧 +岪 +岬 +岰 +岵 +岻 +峅 +峇 +峍 +峘 +峚 +峠 +峴 +峼 +峽 +崁 +崈 +崍 +崐 +崑 +崒 +崗 +崘 +崙 +崚 +崞 +崟 +崠 +崢 +崱 +崵 +崶 +嵎 +嵒 +嵕 +嵖 +嵗 +嵙 +嵛 +嵜 +嵨 +嵮 +嵰 +嵴 +嵻 +嵿 +嶁 +嶃 +嶄 +嶇 +嶋 +嶌 +嶍 +嶒 +嶔 +嶗 +嶝 +嶠 +嶢 +嶦 +嶧 +嶬 +嶰 +嶲 +嶴 +嶷 +嶸 +嶼 +巂 +巄 +巆 +巋 +巌 +巎 +巑 +巒 +巔 +巖 +巘 +巛 +巰 +巶 +巻 +巿 +帔 +帙 +帡 +帢 +帳 +幀 +幃 +幗 +幟 +幣 +幪 +幫 +幵 +幷 +幾 +庀 +庁 +広 +庢 +庲 +庼 +廁 +廂 +廄 +廆 +廈 +廋 +廌 +廍 +廑 +廔 +廕 +廙 +廝 +廞 +廟 +廡 +廢 +廧 +廨 +廩 +廬 +廱 +廸 +廻 +廼 +弁 +弅 +弇 +弉 +弐 +弒 +弔 +弖 +弢 +弨 +弸 +弾 +彀 +彄 +彅 +彆 +彊 +彎 +彔 +彖 +彘 +彙 +彜 +彞 +彠 +彡 +彣 +彥 +彫 +彿 +徂 +徑 +從 +徠 +徧 +徫 +徬 +徭 +徴 +徸 +忉 +忝 +忞 +忬 +忯 +忳 +怍 +怙 +怛 +怵 +恆 +恊 +恥 +恵 +悆 +悛 +悝 +悞 +悧 +悪 +悰 +悳 +惇 +惔 +惣 +惱 +惲 +愃 +愆 +愍 +愐 +愒 +愔 +愜 +愨 +愭 +愴 +愷 +愼 +愾 +慄 +慘 +慚 +慜 +慟 +慣 +慥 +慮 +慳 +慾 +憂 +憊 +憍 +憐 +憑 +憓 +憕 +憙 +憚 +憤 +憫 +憲 +憺 +憻 +懃 +懇 +懌 +懍 +懐 +懣 +懮 +懲 +懶 +懸 +懺 +懼 +懽 +懾 +戇 +戔 +戕 +戙 +戡 +戥 +戦 +戩 +戰 +戱 +戲 +戸 +戻 +戽 +扆 +扥 +抃 +抇 +抦 +拋 +拏 +拝 +拡 +拺 +挙 +挵 +挹 +挻 +挾 +捒 +捜 +捦 +捨 +捩 +捫 +捭 +捱 +掃 +掄 +掙 +掛 +掞 +掟 +採 +掾 +揀 +揄 +揆 +揔 +揮 +揺 +搖 +搗 +搠 +搢 +搳 +搵 +搶 +搾 +摂 +摜 +摟 +摠 +摭 +摯 +摳 +摴 +摵 +摶 +摺 +摻 +摽 +撈 +撐 +撓 +撖 +撙 +撚 +撣 +撥 +撫 +撲 +撳 +撾 +撿 +擁 +擇 +擊 +擋 +擔 +擠 +擥 +擬 +擯 +擰 +擱 +擲 +擴 +擷 +擺 +擼 +擾 +攏 +攔 +攖 +攜 +攞 +攢 +攣 +攤 +攪 +攬 +攴 +攷 +攽 +敍 +敎 +敔 +敗 +敘 +敫 +敭 +敵 +敻 +敾 +斂 +斃 +斎 +斕 +斖 +斝 +斬 +斷 +斿 +旂 +旃 +旄 +旉 +旙 +旛 +旡 +旲 +旳 +旻 +旼 +旽 +旾 +旿 +昃 +昉 +昍 +昐 +昚 +昛 +昜 +昞 +昡 +昣 +昤 +昪 +昫 +昰 +昺 +晈 +晉 +晊 +晙 +晛 +晝 +晩 +晪 +晫 +晭 +晸 +暅 +暈 +暉 +暊 +暌 +暎 +暏 +暐 +暕 +暘 +暝 +暟 +暠 +暦 +暫 +暱 +暲 +暸 +暻 +暾 +曄 +曅 +曆 +曇 +曌 +曔 +曖 +曠 +曧 +曨 +曩 +曮 +曶 +曷 +曺 +曽 +朊 +朏 +朓 +朖 +朧 +朶 +杁 +杌 +杓 +杙 +杣 +杤 +杧 +杬 +杴 +杻 +杼 +枏 +枖 +枛 +枠 +枡 +枲 +枹 +柁 
+柃 +柉 +柊 +柎 +柝 +柟 +柰 +柵 +柶 +柷 +査 +柾 +栃 +栄 +栐 +栒 +栜 +栝 +栞 +栨 +栲 +栴 +栻 +桄 +桕 +桙 +桜 +桝 +桫 +桱 +桲 +桴 +桿 +梀 +梂 +梃 +梉 +梔 +梘 +梟 +梠 +梣 +梫 +梱 +梶 +梽 +棄 +棆 +棐 +棓 +棖 +棗 +棡 +棧 +棨 +棩 +棪 +棫 +棲 +棶 +棹 +棻 +棼 +椆 +椇 +椏 +椙 +椥 +椪 +椲 +椵 +楙 +楡 +楢 +楤 +楧 +楨 +楫 +楮 +楯 +楳 +榊 +榍 +榎 +榑 +榖 +榗 +榘 +榢 +榣 +榤 +榦 +榲 +榿 +槀 +槁 +槃 +槊 +槓 +槔 +槙 +槤 +槩 +槭 +槰 +槱 +槳 +槺 +槻 +槼 +樀 +樁 +樅 +樆 +樋 +樑 +樗 +樘 +樞 +権 +樫 +樺 +樻 +橈 +橐 +橒 +橓 +橚 +橢 +橫 +橿 +檄 +檇 +檉 +檊 +檎 +檜 +檞 +檠 +檡 +檢 +檣 +檦 +檨 +檯 +檳 +檵 +檻 +檽 +櫂 +櫃 +櫆 +櫈 +櫓 +櫚 +櫛 +櫞 +櫟 +櫨 +櫪 +櫱 +櫸 +櫻 +櫾 +櫿 +欄 +欉 +欏 +欒 +欖 +欞 +欥 +欸 +欹 +欽 +歊 +歎 +歛 +歩 +歲 +歳 +歴 +歿 +殂 +殄 +殑 +殘 +殛 +殞 +殟 +殤 +殭 +殮 +殯 +殲 +殳 +毀 +毆 +毉 +毌 +毎 +毐 +毖 +毘 +毬 +毴 +毸 +毿 +氂 +氈 +氍 +氫 +氬 +氷 +氹 +氻 +氾 +汎 +汜 +汧 +汭 +沄 +沆 +沇 +沍 +沒 +沖 +沘 +沚 +沜 +沢 +沨 +沯 +沺 +況 +泂 +泆 +泇 +泐 +泖 +泚 +洌 +洎 +洢 +洣 +洤 +洨 +洩 +洸 +洹 +浄 +浛 +浞 +浟 +浡 +浤 +浯 +浵 +浹 +涙 +涼 +淍 +淎 +淏 +淓 +淛 +淠 +淥 +淪 +淯 +淰 +淵 +淶 +淸 +淺 +淽 +渃 +済 +渉 +渋 +渕 +渙 +渟 +渦 +渫 +渼 +渽 +渾 +湉 +湊 +湔 +湜 +湞 +湣 +湥 +湧 +湳 +湴 +湼 +満 +溁 +溈 +溋 +溎 +準 +溙 +溦 +溲 +溵 +溼 +滀 +滄 +滅 +滈 +滉 +滌 +滎 +滝 +滯 +滲 +滷 +滸 +滹 +滻 +滽 +滾 +漇 +漈 +漎 +漚 +漣 +漬 +漲 +漴 +漵 +漷 +漸 +漼 +漿 +潁 +潑 +潛 +潟 +潯 +潰 +潲 +潽 +潾 +潿 +澀 +澁 +澂 +澆 +澇 +澉 +澋 +澌 +澔 +澗 +澠 +澣 +澥 +澪 +澮 +澯 +澱 +澻 +濁 +濊 +濋 +濘 +濙 +濫 +濬 +濰 +濲 +濶 +濺 +濼 +濾 +瀁 +瀅 +瀆 +瀉 +瀍 +瀏 +瀔 +瀕 +瀝 +瀞 +瀟 +瀠 +瀦 +瀧 +瀨 +瀬 +瀰 +瀲 +瀴 +瀶 +瀾 +灃 +灊 +灑 +灘 +灝 +灤 +灧 +灴 +災 +炁 +炆 +炘 +炟 +炤 +炱 +炲 +炷 +炻 +烉 +烋 +烒 +烔 +烝 +烱 +烴 +焃 +焄 +焌 +焓 +焜 +焞 +焴 +焻 +焿 +煇 +煉 +煐 +煒 +煔 +煕 +煚 +煠 +煩 +煬 +煳 +煵 +煶 +熅 +熇 +熈 +熒 +熖 +熗 +熜 +熤 +熯 +熲 +熺 +熼 +熾 +熿 +燄 +燉 +燊 +燏 +燐 +燔 +燝 +燫 +燬 +燭 +燹 +燻 +燼 +燾 +燿 +爀 +爌 +爔 +爚 +爛 +爝 +爿 +牁 +牂 +牆 +牕 +牖 +牘 +牝 +牠 +牻 +牼 +牽 +犂 +犎 +犖 +犛 +犢 +犧 +犨 +犰 +犴 +犽 +狎 +狓 +狛 +狟 +狦 +狨 +狳 +狶 +狷 +狹 +狻 +猁 +猄 +猇 +猊 +猙 +猞 +猢 +猨 +猳 +猶 +猺 +猻 +獁 +獃 +獄 +獇 +獎 +獏 +獢 +獣 +獬 +獮 +獯 +獰 +獵 +獷 +獺 +獼 +獾 +玀 +玆 +玎 +玏 +玓 +玕 +玗 +玘 +玙 +玠 +玡 +玢 +玧 +玨 +玭 +玶 +玹 +玾 +珅 +珌 +珎 +珖 +珝 +珡 +珤 +珦 +珧 +珪 +珮 +珵 +珹 +珽 +琁 +琄 +琇 +琍 +琎 +琡 +琤 +琱 +琹 +琺 +琿 +瑀 +瑂 +瑆 +瑈 +瑊 +瑋 +瑑 +瑒 +瑝 +瑠 +瑢 +瑣 +瑤 +瑥 +瑧 +瑨 +瑯 +瑱 +瑳 +瑴 +瑺 +璄 +璆 +璉 +璌 +璕 +璘 +璙 +璚 +璠 +璡 +璣 +璥 +璦 +璪 +璫 +璬 +璮 +璱 +璵 +璸 +璹 +璽 +璿 +瓈 +瓊 +瓌 +瓏 +瓑 +瓔 +瓖 +瓘 +瓚 +瓛 +瓞 +甂 +甌 +甍 +甑 +甕 +甡 +甦 +甪 +畀 +畇 +畊 +畋 +畎 +畑 +畝 +畠 +畢 +畧 +畬 +畯 +異 +畳 +畷 
+疇 +疊 +疋 +疍 +疒 +疕 +痍 +痙 +痟 +痩 +痲 +痺 +瘍 +瘓 +瘜 +瘞 +瘡 +瘧 +瘰 +瘺 +癀 +癆 +癇 +癒 +癘 +癟 +癡 +癢 +癤 +癥 +癩 +癬 +癭 +癮 +癯 +癰 +癱 +癲 +発 +皐 +皚 +皛 +皝 +皞 +皰 +皷 +皸 +盃 +盋 +盌 +盞 +盡 +監 +盦 +盧 +盨 +盩 +盪 +盫 +盷 +盺 +眀 +県 +眛 +眜 +眥 +眵 +眾 +睜 +睞 +睥 +睪 +睭 +睺 +瞋 +瞞 +瞢 +瞫 +瞼 +瞽 +矇 +矍 +矚 +矧 +矯 +砢 +砩 +砫 +砮 +砯 +砲 +砳 +砵 +硃 +硇 +硏 +硐 +硓 +硜 +硤 +硨 +硭 +硯 +碕 +碡 +碪 +碭 +碸 +碻 +碽 +磔 +磘 +磙 +磜 +磡 +磪 +磯 +磱 +磲 +磵 +磻 +磾 +礄 +礎 +礐 +礑 +礒 +礙 +礠 +礦 +礪 +礫 +礬 +礮 +礱 +礽 +祂 +祆 +祇 +祋 +祏 +祓 +祕 +祧 +祹 +祿 +禃 +禇 +禍 +禎 +禑 +禓 +禔 +禕 +禘 +禛 +禟 +禠 +禤 +禨 +禩 +禰 +禱 +禵 +禼 +禿 +秈 +秠 +秳 +稅 +稈 +稉 +稑 +稘 +稙 +稜 +稟 +稱 +稲 +稺 +稾 +穀 +穈 +穉 +穌 +積 +穎 +穟 +穠 +穡 +穢 +穣 +穫 +窅 +窋 +窣 +窩 +窪 +窮 +窯 +窰 +窶 +窺 +竄 +竅 +竇 +竈 +竊 +竑 +竜 +竦 +竩 +竻 +笄 +笘 +笞 +笥 +笩 +笪 +笭 +笮 +笯 +笱 +笳 +笹 +筅 +筊 +筌 +筍 +筘 +筥 +筦 +筧 +筬 +筭 +筲 +筳 +筶 +筻 +箆 +箇 +箋 +箏 +箑 +箒 +箜 +範 +篊 +篋 +篌 +篔 +篠 +篤 +篥 +篩 +篭 +篯 +篳 +簀 +簃 +簉 +簍 +簑 +簕 +簗 +簞 +簠 +簫 +簷 +簹 +簺 +簽 +簾 +籀 +籌 +籐 +籙 +籛 +籜 +籝 +籟 +籠 +籥 +籪 +籬 +籮 +籲 +籾 +粄 +粍 +粦 +粩 +糀 +糌 +糎 +糞 +糢 +糧 +糬 +糰 +糴 +糶 +糸 +糹 +糺 +糾 +紂 +紆 +紇 +紈 +紉 +紐 +紑 +紓 +紕 +紘 +紙 +紛 +紜 +紝 +紞 +紮 +紱 +紲 +紳 +紵 +紺 +紿 +絃 +絆 +経 +絎 +絕 +絛 +絜 +絞 +絢 +絨 +絪 +絳 +絵 +絹 +絺 +綃 +綈 +綎 +綏 +綖 +継 +続 +綜 +綝 +綞 +綠 +綢 +綣 +綧 +綬 +綮 +綰 +綳 +綴 +綸 +綺 +綻 +綽 +綾 +綿 +緁 +緃 +緄 +緈 +緊 +緋 +総 +緒 +緘 +緜 +緝 +緞 +締 +緡 +緤 +編 +緩 +緬 +緱 +緲 +練 +縂 +縄 +縈 +縉 +縊 +縕 +縛 +縝 +縞 +縠 +縡 +縯 +縱 +縴 +縵 +縷 +縹 +縻 +績 +繃 +繆 +繇 +繒 +繕 +繖 +繙 +繚 +繞 +繩 +繫 +繭 +繰 +繳 +繻 +繼 +繽 +繾 +纁 +纈 +纍 +纏 +纓 +纔 +纕 +纖 +纘 +纜 +缶 +缽 +罃 +罅 +罈 +罉 +罌 +罍 +罟 +罨 +罰 +罳 +罵 +罶 +罷 +罽 +羂 +羆 +羈 +羋 +羕 +羗 +羣 +羥 +羨 +羱 +翀 +翂 +翃 +翕 +翙 +翜 +翬 +翮 +翹 +耎 +耔 +耨 +耬 +聃 +聒 +聟 +聰 +聱 +聳 +聴 +聶 +聽 +聾 +肅 +肏 +肜 +肫 +肸 +肹 +胂 +胅 +胇 +胊 +胙 +胝 +胼 +脅 +脇 +脈 +脛 +脣 +脩 +脫 +脬 +脭 +脳 +脷 +脹 +腧 +腫 +腳 +膂 +膣 +膥 +膩 +膮 +膽 +膾 +膿 +臉 +臍 +臏 +臚 +臞 +臟 +臠 +臯 +舂 +舉 +舎 +舘 +舢 +舥 +舨 +舩 +舲 +舺 +艅 +艉 +艋 +艎 +艏 +艔 +艙 +艚 +艱 +艸 +艽 +芑 +芛 +芨 +芴 +芻 +苅 +苤 +苧 +苳 +苺 +苻 +苾 +茀 +茇 +茈 +茘 +茚 +茛 +茝 +茮 +茲 +茷 +茺 +荅 +荇 +荊 +荎 +荖 +荳 +莕 +莖 +莙 +莛 +莢 +莧 +莩 +莿 +菈 +菉 +菍 +菑 +菔 +菝 +菥 +菫 +菰 +菴 +菶 +菸 +菹 +菺 +菼 +菾 +萇 +萐 +萠 +萡 +萣 +萩 +萵 +萹 +葃 +葊 +葎 +葙 +葜 +葝 +葦 +葯 +葰 +葶 +葷 +蒍 +蒎 +蒐 +蒓 +蒔 +蒗 +蒞 +蒢 +蒧 +蒨 +蒭 +蒯 +蒴 +蒹 +蒺 +蒼 +蒾 +蓀 +蓁 +蓂 +蓆 +蓍 +蓘 +蓚 +蓧 +蓨 +蓪 +蓭 +蓯 +蓳 +蓽 +蔆 +蔎 +蔔 +蔕 +蔘 +蔝 +蔞 +蔣 +蔥 +蔦 +蔭 +蔴 +蔵 
+蕁 +蕅 +蕎 +蕑 +蕖 +蕘 +蕚 +蕡 +蕢 +蕩 +蕪 +蕭 +蕷 +蕺 +蕻 +薀 +薆 +薊 +薌 +薐 +薑 +薔 +薗 +薘 +薙 +薜 +薞 +薟 +薨 +薫 +薲 +薷 +薸 +薺 +薾 +薿 +藎 +藟 +藦 +藨 +藪 +藶 +藸 +藹 +藺 +蘂 +蘄 +蘅 +蘊 +蘋 +蘐 +蘓 +蘗 +蘘 +蘚 +蘞 +蘢 +蘧 +蘩 +蘵 +蘶 +蘿 +虉 +虓 +虖 +虛 +虜 +虧 +虨 +虯 +虵 +虺 +蚆 +蚋 +蚍 +蚖 +蚡 +蚢 +蚵 +蚺 +蚼 +蛄 +蛉 +蛍 +蛑 +蛞 +蛯 +蛸 +蛺 +蛻 +蜆 +蜉 +蜑 +蜞 +蜢 +蜣 +蜨 +蜮 +蜯 +蜾 +蝀 +蝍 +蝓 +蝕 +蝘 +蝚 +蝟 +蝣 +蝤 +蝦 +蝨 +蝮 +蝯 +蝰 +蝲 +蝸 +螄 +螅 +螋 +螐 +螔 +螞 +螠 +螢 +螣 +螥 +螫 +螭 +螶 +螻 +螽 +螾 +蟄 +蟅 +蟊 +蟌 +蟎 +蟜 +蟥 +蟪 +蟫 +蟬 +蟯 +蟲 +蟳 +蟴 +蟶 +蟻 +蠂 +蠃 +蠅 +蠆 +蠊 +蠋 +蠍 +蠐 +蠑 +蠘 +蠙 +蠟 +蠣 +蠱 +蠲 +蠵 +蠶 +蠷 +蠻 +衂 +衎 +衕 +衚 +衜 +衝 +衞 +衽 +袓 +袛 +袞 +袴 +袾 +裊 +裎 +裒 +裖 +裬 +裵 +裾 +裿 +褌 +褍 +褎 +褘 +褙 +褞 +褧 +褫 +褭 +褸 +褻 +襌 +襖 +襞 +襠 +襤 +襦 +襪 +襯 +襲 +襴 +襶 +襻 +襾 +覇 +覈 +規 +覓 +覚 +覡 +覦 +覧 +覬 +覲 +観 +覽 +覿 +觔 +觙 +觚 +觜 +觭 +觱 +觴 +觶 +觿 +訁 +訃 +訇 +訌 +討 +訏 +訐 +訒 +訔 +訕 +訖 +託 +訛 +訝 +訟 +訥 +訴 +訶 +註 +証 +詁 +詆 +詈 +詐 +詒 +詔 +評 +詛 +詞 +詠 +詡 +詣 +詥 +詧 +詫 +詭 +詮 +詰 +詳 +詵 +詼 +誄 +誅 +誇 +認 +誒 +誕 +誡 +誣 +誤 +誥 +誦 +誨 +說 +読 +誰 +誴 +誹 +誼 +誾 +談 +諍 +諏 +諒 +論 +諗 +諜 +諟 +諠 +諡 +諤 +諦 +諧 +諪 +諫 +諭 +諮 +諱 +諲 +諳 +諴 +諶 +諷 +諸 +諺 +諼 +謀 +謁 +謂 +謄 +謊 +謌 +謎 +謏 +謐 +謔 +謖 +謗 +謙 +謚 +謜 +謠 +謤 +謨 +謩 +謫 +謬 +謳 +謾 +譏 +譓 +譔 +譙 +譚 +譞 +譫 +譭 +譯 +議 +譲 +譳 +譴 +譽 +譿 +讃 +讌 +讎 +讓 +讖 +讙 +讚 +讜 +讞 +谿 +豈 +豎 +豔 +豢 +豨 +豬 +豳 +豸 +豿 +貐 +貒 +貓 +貘 +貞 +貤 +貧 +貪 +貫 +責 +貮 +貯 +貲 +貳 +貶 +貸 +貺 +貽 +賁 +賂 +賃 +賄 +賈 +賊 +賑 +賒 +賔 +賕 +賚 +賞 +賡 +賤 +賦 +賨 +賬 +賭 +賹 +賺 +賻 +賽 +賾 +贄 +贅 +贇 +贊 +贌 +贍 +贓 +贔 +贖 +贛 +赧 +赬 +趐 +趕 +趖 +趨 +趺 +趼 +跅 +跏 +跗 +跡 +跣 +跩 +踎 +踐 +踰 +踴 +蹕 +蹟 +蹠 +蹤 +蹯 +蹺 +蹻 +躂 +躄 +躉 +躋 +躍 +躑 +躒 +躔 +躝 +躪 +躰 +軀 +軋 +軔 +軛 +軟 +転 +軫 +軲 +軸 +軹 +軺 +軻 +軼 +軾 +較 +輄 +輅 +輋 +輒 +輓 +輔 +輛 +輞 +輟 +輥 +輦 +輩 +輬 +輭 +輶 +輻 +輾 +輿 +轀 +轂 +轄 +轅 +轆 +轍 +轎 +轘 +轝 +轟 +轤 +辭 +辮 +辯 +辵 +辺 +辻 +込 +迴 +迵 +迺 +逈 +逋 +逌 +逎 +逕 +逖 +逤 +逨 +逴 +遄 +遊 +違 +遘 +遙 +遜 +遞 +遯 +遲 +遶 +遷 +遹 +遺 +遼 +邁 +邇 +邉 +邊 +邙 +邠 +邲 +邽 +邾 +郃 +郄 +郇 +郋 +郞 +郟 +郤 +郪 +郳 +郷 +郿 +鄃 +鄆 +鄋 +鄑 +鄒 +鄔 +鄖 +鄗 +鄘 +鄚 +鄜 +鄠 +鄤 +鄧 +鄩 +鄫 +鄰 +鄲 +鄳 +鄴 +酃 +酆 +酈 +酎 +酏 +酔 +酢 +酩 +酴 +酺 +酼 +醁 +醂 +醃 +醅 +醞 +醢 +醣 +醮 +醯 +醾 +醿 +釁 +釆 +釋 +釐 +釒 +釓 +釔 +釕 +釗 +釘 +釙 +釚 +釤 +釦 +釧 +釩 +釪 +釭 +釴 +釵 +釷 +釹 +釺 +鈀 +鈁 +鈄 +鈇 +鈈 +鈉 +鈊 +鈍 +鈏 +鈐 +鈑 +鈔 +鈕 +鈖 +鈞 +鈢 +鈣 +鈥 +鈦 +鈫 +鈮 +鈰 +鈳 +鈷 +鈸 +鈹 +鈾 +鈿 +鉀 +鉆 +鉈 +鉉 +鉋 +鉍 +鉏 +鉑 +鉓 +鉗 +鉚 +鉛 
+鉞 +鉟 +鉤 +鉦 +鉬 +鉭 +鉲 +鉶 +鉷 +鉸 +鉻 +鉾 +鉿 +銂 +銃 +銅 +銋 +銍 +銓 +銕 +銖 +銚 +銜 +銠 +銣 +銥 +銦 +銨 +銩 +銪 +銫 +銬 +銱 +銲 +銳 +銶 +銹 +銻 +銼 +銾 +鋁 +鋅 +鋆 +鋇 +鋌 +鋏 +鋐 +鋒 +鋕 +鋗 +鋙 +鋡 +鋤 +鋥 +鋦 +鋨 +鋮 +鋯 +鋰 +鋱 +鋳 +鋶 +鋸 +鋹 +錀 +錏 +錐 +錒 +錕 +錘 +錚 +錞 +錟 +錠 +錡 +錢 +錨 +錫 +錬 +錮 +錳 +錸 +錻 +鍀 +鍇 +鍈 +鍉 +鍍 +鍏 +鍔 +鍘 +鍛 +鍝 +鍟 +鍠 +鍥 +鍩 +鍬 +鍱 +鍳 +鍶 +鍷 +鍺 +鍼 +鍾 +鎂 +鎅 +鎊 +鎌 +鎓 +鎔 +鎗 +鎘 +鎚 +鎛 +鎣 +鎦 +鎧 +鎪 +鎬 +鎭 +鎰 +鎳 +鎵 +鏃 +鏇 +鏈 +鏊 +鏌 +鏐 +鏑 +鏓 +鏗 +鏘 +鏝 +鏞 +鏟 +鏤 +鏦 +鏳 +鏴 +鏵 +鏷 +鏻 +鏽 +鐃 +鐇 +鐈 +鐓 +鐔 +鐙 +鐠 +鐤 +鐦 +鐧 +鐫 +鐬 +鐭 +鐮 +鐲 +鐳 +鐸 +鐺 +鐽 +鐿 +鑀 +鑁 +鑂 +鑄 +鑅 +鑊 +鑌 +鑑 +鑛 +鑠 +鑣 +鑨 +鑪 +鑭 +鑰 +鑲 +鑴 +鑷 +鑼 +鑾 +鑿 +閂 +閆 +閉 +閎 +閒 +閔 +閘 +閜 +閞 +閦 +閨 +閬 +閭 +閰 +閱 +閶 +閹 +閻 +閼 +閾 +閿 +闆 +闇 +闈 +闊 +闋 +闌 +闍 +闐 +闓 +闔 +闕 +闖 +闘 +闞 +闡 +闢 +闥 +阭 +阯 +陁 +陔 +陘 +陜 +陝 +陞 +陬 +陸 +険 +隄 +隈 +隊 +階 +隕 +隣 +險 +隰 +隱 +隲 +隳 +隴 +隷 +隸 +隻 +雋 +雑 +雖 +雛 +雝 +雩 +雫 +雱 +霅 +霈 +霊 +霑 +霙 +霤 +霧 +霨 +霶 +霽 +靁 +靂 +靄 +靉 +靚 +靫 +靬 +靭 +靺 +靼 +鞆 +鞏 +鞞 +鞥 +鞦 +鞨 +鞮 +鞴 +韁 +韃 +韆 +韋 +韌 +韑 +韙 +韜 +韞 +韠 +韡 +韮 +韺 +韾 +頁 +頃 +項 +須 +頊 +頌 +頍 +頎 +頏 +預 +頑 +頒 +頗 +領 +頜 +頠 +頦 +頫 +頴 +頵 +頷 +頸 +頹 +頻 +頼 +顆 +額 +顎 +顒 +顓 +顕 +顗 +願 +顙 +顛 +顥 +顫 +顰 +顱 +顳 +顴 +颮 +颯 +颱 +颶 +颺 +颼 +飆 +飈 +飠 +飡 +飢 +飥 +飩 +飪 +飫 +飭 +飴 +飽 +餃 +餄 +餉 +餌 +餎 +餒 +餓 +餗 +餚 +餛 +餞 +餡 +餵 +餺 +餾 +餿 +饋 +饌 +饑 +饒 +饗 +饞 +饟 +饢 +馘 +馛 +馦 +馭 +馯 +馱 +馳 +馼 +駁 +駄 +駅 +駆 +駐 +駑 +駒 +駔 +駘 +駙 +駛 +駝 +駟 +駢 +駭 +駰 +駱 +騁 +騂 +騄 +騅 +騋 +騎 +騏 +験 +騖 +騙 +騤 +騨 +騫 +騭 +騮 +騶 +騾 +驁 +驃 +驄 +驅 +驌 +驍 +驎 +驒 +驕 +驚 +驛 +驟 +驢 +驤 +驥 +驩 +驪 +骯 +髀 +髎 +髏 +髑 +髒 +髡 +髭 +髲 +髷 +髹 +鬄 +鬅 +鬆 +鬍 +鬚 +鬢 +鬥 +鬧 +鬨 +鬩 +鬪 +鬬 +鬮 +鬯 +鬱 +鬹 +鬻 +魃 +魈 +魋 +魍 +魎 +魕 +魘 +魛 +魞 +魟 +魣 +魨 +魩 +魮 +魯 +魴 +魷 +鮀 +鮁 +鮃 +鮄 +鮊 +鮋 +鮍 +鮐 +鮑 +鮒 +鮓 +鮗 +鮜 +鮟 +鮠 +鮡 +鮣 +鮨 +鮪 +鮫 +鮭 +鮰 +鮸 +鮹 +鮻 +鯀 +鯁 +鯃 +鯇 +鯊 +鯏 +鯒 +鯓 +鯔 +鯕 +鯖 +鯗 +鯙 +鯛 +鯡 +鯢 +鯤 +鯧 +鯪 +鯭 +鯮 +鯰 +鯶 +鯷 +鯻 +鯿 +鰂 +鰃 +鰆 +鰈 +鰉 +鰍 +鰏 +鰒 +鰓 +鰕 +鰗 +鰛 +鰜 +鰟 +鰣 +鰤 +鰧 +鰨 +鰩 +鰭 +鰮 +鰱 +鰲 +鰳 +鰶 +鰷 +鰹 +鰺 +鰻 +鰼 +鰾 +鱀 +鱂 +鱅 +鱇 +鱈 +鱉 +鱊 +鱒 +鱓 +鱔 +鱖 +鱗 +鱘 +鱚 +鱝 +鱟 +鱠 +鱣 +鱥 +鱧 +鱨 +鱮 +鱰 +鱲 +鱵 +鱷 +鱺 +鳧 +鳩 +鳰 +鳴 +鳶 +鳽 +鴆 +鴇 +鴉 +鴒 +鴓 +鴕 +鴗 +鴛 +鴝 +鴞 +鴟 +鴡 +鴣 +鴦 +鴫 +鴯 +鴰 +鴴 +鵂 +鵄 +鵎 +鵐 +鵑 +鵒 +鵓 +鵙 +鵜 +鵝 +鵞 +鵟 +鵠 +鵡 +鵪 +鵯 +鵰 +鵲 +鵵 +鵼 +鵾 +鶆 +鶇 +鶉 +鶒 +鶓 +鶘 +鶚 +鶡 +鶥 +鶩 +鶬 +鶯 +鶲 +鶹 +鶺 +鶻 +鶼 +鶿 +鷂 +鷉 +鷎 +鷓 +鷙 +鷚 +鷟 
+鷥 +鷦 +鷫 +鷯 +鷲 +鷳 +鷸 +鸊 +鸌 +鸐 +鸑 +鸕 +鸘 +鸚 +鸛 +鸜 +鸝 +鸞 +鹮 +鹵 +鹹 +鹼 +麅 +麇 +麈 +麊 +麐 +麞 +麩 +麪 +麴 +麹 +麼 +麿 +黁 +黇 +黌 +黐 +黙 +黥 +黧 +黨 +黴 +黶 +黻 +黼 +黽 +黿 +鼂 +鼇 +鼈 +鼉 +鼐 +鼒 +鼕 +鼢 +鼩 +鼯 +鼱 +鼴 +鼷 +齒 +齕 +齡 +齣 +齦 +齧 +齲 +齶 +龎 +龐 +龑 +龔 +龕 +龜 +龝 +龠 +ず +梌 +叀 +晢 +媸 +錾 +鐖 +䰡 +櫬 +锱 +υ +鼗 +媪 +澴 +苈 +眴 +𝜏 +缱 +𝜶 +조 +晡 +≡ +ࠀ +н +廇 +嗛 +篚 +ώ +莰 +윤 +纚 +𢢞 +闼 +熌 +饎 +蓊 +倅 +년 +聭 +耩 +≅ + +≺ +诌 + + + +耰 +菗 +僦 +⇣ +甊 +冓 +缷 +枊 +沕 +𝐴 +❹ +형 +秾 + +щ +厹 + +˗ +疔 +䩦 +髴 +⨂ +莏 +≧ +垆 +銌 +桤 +隤 +ギ +벽 +⑸ +✘ +̣ +辶 +铼 +게 +へ +獶 +藳 +祍 +黉 +跱 +⽬ +埙 +だ +蓣 +亯 +구 + +鹎 + +⾃ +楩 +⌘ +汏 +虒 +谖 + +﹜ +劖 +じ +瑇 +㮑 +揕 +⇔ +𤔲 +薉 +𝑾 +硗 +〈 +は +盍 +狽 +ж +я +挆 +槨 +γ +阏 +襕 +𝜉 +❖ +└ +총 +시 + +ν +刲 +ด +嬲 +绤 +𝐰 +飦 +扱 +帻 +辀 +廴 +к +蔖 +– +같 +熭 +巣 + +裛 +𝑶 +蓺 +蔊 +그 +匳 +玚 +Ц +璲 +련 +𨒅 +변 +㤵 +饫 +𨚵 +X +筇 +镡 +ⅳ +𝛿 +轸 +𝑭 +鋈 +鵩 +縁 +˙ +ɿ +𝒴 +㝮 +𝜂 +栠 +橦 +緇 +肰 + +跼 +䭜 +蜅 +訸 +㻶 +𝑉 +เ + +嚢 +鼔 +𝒆 +閫 +阃 +𥞹 +杪 +誊 + +鲋 +骍 +τ +莾 +凊 +﹡ +箚 +蛱 +樯 +喾 +幞 +欕 +搡 +戉 +瘖 +᙭ +砟 +ས +∤ +ี +メ +𝝁 +穑 +渶 +𦬁 +서 +⊗ +穇 +⌊ +を +鐻 +蘤 +≫ +◐ +汙 +蒒 +⑷ +蹨 +x +裥 +嶤 +ァ +従 +침 +稂 +𪧶 +で +𝑹 +⑫ +闩 +槫 +舮 +𝑿 +戁 +간 +戯 + +ོ +æ +わ +チ +砉 +Ψ +劂 +・ +В +鬭 +钔 +盭 +黓 +⎯ +𝐏 +함 +钪 +𝑸 +澰 +래 +藒 +龃 +瞀 +伧 +♂ +¹ +ƞ +澼 +餍 +倶 +ð + +嚱 +跬 +貙 +磿 +娬 +氿 +鹘 +𝐁 +摅 +ヱ +傰 +พ +湝 +ˆ +Л +翾 +≃ +에 +滫 +С +嫕 +あ +㈣ +ⅇ +垧 +⺮ +∠ +躐 +硌 +眢 +乧 +𝑐 +泃 +轫 +↔ +㎝ +≜ +⽇ +撟 +⟹ +脿 + +㸁 +靯 +う +⁠ +懬 +搷 +瀓 +ˁ +ⅲ +훈 + +お +𝛄 +瓅 +葻 +猋 +ら +⾳ +喣 +⽿ +č +鈎 +⑤ +å +阸 +름 +て +圮 +⚫ +⻄ +胨 +琠 +戄 +箄 +𝒳 +鼍 +й +⼲ +廪 +睃 +囫 +͞ +죄 + +호 +み +饩 + +⊆ +х +欚 +瘚 +≯ +瞗 +ž +嗵 +근 +ま + +⾔ +罥 +ʹ +鼃 +д +✳ +ゃ +悊 +𝐅 +영 +@ +ɣ +𝛷 +𝜁 +ǜ +犄 +⽂ +ཆ +胒 +﹦ +谫 +є +・ +𝐻 +狺 +백 +舳 +𝑁 +ษ + +𝜓 +𝒦 +盕 +유 +𪯐 +茑 +礤 +거 +コ +肂 +鸻 +ã +⑬ +铚 +걸 +磳 +綷 +𝒚 +舭 +腚 +㈩ +榱 +𝐌 +畾 +馐 +罾 +∕ +𝔛 +𝑬 +ç +楬 +櫽 +顼 +阋 + +꺼 +諛 +̌ +้ +㮀 +乵 +沬 +⼀ +ư +鲠 +䜩 +樉 +鹈 +搧 +轾 +䟒 +등 +𝝉 +잠 +짤 +า +蘨 +愪 +ྟ +慪 +鮝 +𝛑 +び +𥞪 +𝐾 +レ +교 +ྲ +달 +𝐩 + +殹 +踇 +狥 +ベ +미 +매 +⑭ +钁 +Θ +못 +𝜇 +侂 +ę +ฟ +邶 +諣 +颃 +𡢕 +昑 +𝒖 +讱 +﹤ +緵 +骢 +朢 +骘 +ℜ + +ゞ +愬 +鹬 + +ッ +ར +급 +‚ +鸶 +蒫 +餽 +蓃 +ข +辠 +ğ +氺 +暆 +笿 +迚 +甝 +ή +徼 +旣 +ϖ +ヲ +倕 +匽 +蓱 +리 +剷 +ู +逪 + +나 +堋 +焠 +Δ +炑 +爫 +蒖 +𝒓 +悫 +𝛱 + +𝐮 +騧 +ⅴ +饾 +贠 +𝚲 +崀 +磀 +柤 +肈 +⻮ +鶄 +狲 +跫 +지 +鳇 +痖 +跂 +秫 +ʒ +합 +ไ +迨 
+𝜐 + +屦 +𝐶 +; +辎 +∵ +鴁 +撏 +ς +⟶ +薮 +㟪 +犮 +ب +ビ +藡 +甏 + +眡 +訿 +鉥 +媵 + +柫 +𝒞 +ь +萏 +ค +트 +訮 +汚 +眚 +〞 +き +ほ +刖 +髄 +蘀 +や +ة +诹 +т +ན +𝒃 +掼 +䓁 +僥 +팰 +枵 +✔ +³ +ེ +鼖 +屖 +鍮 +砇 +カ +舐 +牴 +𝜎 +㡿 +攉 +⽤ +晅 +労 +蛕 +𝐽 +Ʃ +く +穽 +孥 +𝒏 + + +ɬ +玦 +檮 +ョ +∥ +중 +萯 +呲 +䰈 +새 + +釶 + +ɢ +⊂ +臮 + +梼 +デ +骖 +ス +蹩 +羼 +▽ +Π +≪ +匛 +𝐼 +稊 +่ +茠 +䢉 +秝 +茐 +齎 +そ + +芕 +噚 +癉 +蹱 +蓜 +𝐬 +ϑ +е +瀋 +ϕ +χ +镟 +霂 +隒 +▱ +ヶ +撄 +둔 +¢ +こ +跲 +莻 +𝑠 +輮 +็ +堠 +푟 +赕 +◦ +ا +런 +帒 +汘 +̱ +尥 +蘠 +𦟜 +옥 +腠 +夨 +⩾ +𝑝 +歯 +刱 +여 +け +溘 +釰 +肍 +擗 +矱 +鍌 +芧 +술 +발 +鼫 +舾 +⼯ +𝝓 +ƒ +怸 +པ +𣐼 +疎 +铷 +Η +⑺ +蒏 +림 +⃛ +゜ +褴 +𨒪 +れ +揢 +さ +櫫 +櫑 +䋎 +灋 +櫜 +诓 +❶ +𝐃 +Q +袳 +ℒ +菂 + +荙 +ℛ +⁄ +堙 +贋 +̅ +鳏 +̂ +、 +茍 +泜 +𝑈 +즉 +噔 + + +迓 +Ⅸ +❷ +이 +_ +⾊ +Ö +铥 +耹 +䶮 + +무 + +セ +饳 +อ +篾 +통 +‒ +ย +덕 +말 +艨 +Ω +𝐨 +螓 +澐 +巠 +⋅ +钶 +도 +鸱 +齍 +恑 +褛 +剟 +준 +勶 +𠟠 +ß +箅 +𝑆 +悃 +蘥 + +Ξ +𝑘 +妣 +𝑖 +𝐑 +纡 +釿 +⺌ +ヴ +𝕀 +涻 +箙 +塚 + +⼠ +墈 +∷ +疴 +ク +ㄕ +𝒂 +蒪 +蓡 + +鷇 +瘏 +𣹳 +橰 +嵚 +帀 +주 +ド +盓 +爇 +φ +觋 +𝜑 +钍 +화 +표 +Ɛ +篰 +명 +週 +с +蓛 +裢 +穜 +㱃 +玊 +鲕 +蒕 +箪 +⑯ +苽 +矦 +偰 +盝 +佊 +僨 +駉 +𝑳 +머 +ª +絅 + +锒 + +苆 +ั +𝛻 +碹 +咺 +竝 +и +づ +강 +辁 +́ +铽 +纩 +齑 +𝝎 +어 +ユ +躡 +𝒄 +ซ +畛 +鸰 +ླ + +骉 +❸ +揲 +廃 +湋 +𝑲 + + +旤 + +蹷 +钌 +국 +豙 +鬳 + +ɛ +轳 +俜 +眄 +萮 +𝐡 +颵 +箓 +魑 +𝑅 +漍 +ℤ + +逡 +학 +浖 +ょ +¬ +怴 +𝛤 +怿 +祌 +纥 +𝒑 +⃑ +棅 +笵 +낭 +栦 +⑰ +บ +𝔽 +𝑇 +埝 +⽓ +孱 +埶 +匜 +鸼 + + +벌 +ル +锸 +斫 +妟 +뽀 +昬 +댁 +ʂ +暯 +夳 +ノ +堞 +懘 +榼 +鞫 +오 +𝑡 +偑 +戗 +∴ +伥 +끝 +𬌗 +稯 +岜 +Ε +犲 +𩓞 +연 +鹚 + +ག +诜 +嗍 +倥 +鳣 +庑 +屾 +雚 + +椄 +颏 +酤 +𝒋 +欛 + +း +려 +缋 +¾ +ゴ +籑 +笤 +鞛 +鏺 +蓒 +설 +緍 +⑩ +迀 +鼋 +ɮ +위 +锪 +∨ +滆 +€ +躅 +鋓 +柀 +䐶 +啎 +𝛵 +骃 +ć +갈 +卨 +い +𝑺 +鸲 +壻 +偯 +𝑞 +譖 +곤 +溍 + +噫 +순 + +𝑽 +ы +赑 +蓸 +鸮 +稃 +っ +詗 +으 +⨀ +屮 +俦 +伛 +畱 +늬 +𝑂 +朼 +沰 +겨 +з +骀 +鸩 +𝜈 +º +苊 +诎 +皤 + +하 +̀ +砑 +凷 +翄 +𝑛 +赪 +≮ +浗 +𝐍 +û +オ +ƹ +𝜅 +묘 +曛 +鳊 +𝛩 +癹 +磒 +ば +⑨ +礆 + +乼 + +∽ +褱 +藴 +縶 +觥 +に +식 +凫 + +佥 +槷 +阍 +䰍 +졸 +전 +葢 +㝸 +も +⻔ +遽 + +蹰 +𝛺 +裏 +། +를 + +ろ +짭 + +ぐ +싶 +渰 +⊤ +浳 +൯ +∃ +옛 +蟞 +과 +芠 +飖 +⼆ +敶 +粝 +𥃩 +坿 +䩉 +𝑯 +「 +矰 + +사 +𝛶 +𝑎 +挐 +푎 +동 +ℝ +Γ +︃ +珒 +鹍 +κ +鑓 +傁 +惓 +臿 +丣 +悒 +侔 +ñ +訳 +櫭 +賛 +觏 +辂 +覅 +濓 +堿 +擪 +฀ +𝑵 +扨 +嫫 +珰 + +寃 +𝒔 +曱 +髣 +인 +≌ +莵 +踳 +ⅱ +Ø +⌋ +¯ +挢 +̇ +﹪ +哕 +𦫳 + +襛 +昳 +铙 +铫 +軱 
+汔 +ネ + +躩 +옷 +ถ +엄 +皊 +臑 +𧄝 +𝑃 + +䢅 +𝐝 +𝒍 +ℱ +𝐓 +蓾 +𝑻 +䋁 +裼 +개 +ത +𝒊 +僪 +瞂 +𦞠 +요 + ̄ +荍 +𝜔 +ф +峣 +庋 +檏 +袢 +绬 +Σ +향 +钫 +え +枅 +≝ +荦 +들 +勍 +ö +𝒕 +툰 +遬 +𝐵 +擧 +咢 +钘 + +𝒢 +Ⅷ +➢ +讧 +ω +簟 +廐 +刳 +阘 +б +⊘ +髟 +臓 +루 +⎧ +诳 +у +诮 +蠪 +梹 +耤 +パ +ن +∆ + +𝑫 +น +べ +坼 + +𝑤 +褽 +憼 +심 +∇ +迖 +휆 +叚 +없 +⼿ +钖 +斠 +䪵 +胠 +𝜋 +殽 +剜 +⾝ +− +慸 +𝛽 +椔 +⟩ +皦 +筚 +奰 +Å +물 +𝒐 +嫱 +钆 +ï +∪ +⇢ +ş + +㖞 +璗 +葸 +殢 +𝜺 +夲 +骒 +ち +회 +선 +睒 +轡 +ξ + +鲧 +镞 +碜 +놈 +Å +紴 + +⇤ +ྷ +⑪ +喟 +𦼰 + +蔩 +埦 +𝜆 +耋 +˜ +한 +舣 +馓 +⑻ + +ɐ +椘 +し +莐 +辔 +憰 +碛 +⁃ +飏 +颀 +跽 +⇥ +赀 +撺 +襜 +ɒ +袧 +л +정 +꾸 +콩 + +박 +缑 +柈 + +樲 +𝑮 +詘 +µ +𝑷 +鹪 +𝛼 +차 +讬 +掯 +硎 +𝑨 +舄 +‹ +누 +バ +ก +萀 +兇 +숙 +貍 + +踈 +친 +𝜽 +摰 +甿 +坜 +遑 +삼 +배 +Μ +을 +譊 +沩 +빈 +飑 +钹 +镨 +鐉 +宭 +桗 +ə +歺 +А +⇒ +锞 +𝒪 +棊 +愓 +莶 +琲 +འ +プ +་ +𝐿 +艟 +欬 +ิ +в +ų +纻 +㎎ +婄 +Ρ +歟 +椢 +粜 +종 +خ +ね +剞 +베 +斄 +幠 +ト +疛 +よ +╳ +醳 +군 +諂 +芰 +穋 +禆 +길 +秊 +噙 +y +锓 +⁵ + +拠 +Ĥ +𝑒 +窬 +抈 +︰ +퐶 +铳 +각 +ش +錉 +ù +臝 +闶 +𝒟 +芐 +韎 +권 +萚 + +ど +羮 +镕 +欔 +瘐 +받 +𝑚 +𢦟 +髤 +腙 +⽽ +상 +铘 +장 +𤇈 +ニ +凂 +ȷ +육 +а +살 +雠 +荑 +태 +穤 +ɯ + +圬 +楑 +단 +ง +⾯ +λ +⁰ +성 +萿 +缌 + +毣 +矅 + +푚 +˘ +貣 +∂ +은 +ė +䌛 +경 +せ + +拫 +⅞ +餕 +鐨 +翭 +ื +ɵ +⺍ +Փ +▬ +ว +희 +짐 +屙 +洫 +ေ +∏ +臜 + +剸 +芓 +운 +∓ +계 +祔 +鶵 +𝝅 +柂 +訢 +禊 +扽 +恫 +𝜙 +狢 +勠 +ི +𝜒 +จ +铯 +ྱ +𝑙 +蟇 +울 +莤 +牱 +𝒗 +詇 +靃 +殓 +栍 +踟 +ي + +鲄 +㓷 +贳 +ナ +鲓 +𝒙 +薁 +Χ +侪 +恌 +㰤 +목 +̄ +丱 +― +𝛔 +𝑔 + +鸷 +﹣ +籢 +脢 +δ +窭 +‐ +阒 +석 +아 +ォ +두 +𝐦 +浬 +搰 +褃 + +ལ +乇 +腘 +眊 +偬 +Ⅻ +ℳ +畤 +芟 +曈 +飧 +堌 +═ +谶 +櫝 +嬑 +冋 +嗌 +抜 + +腜 +공 +𝜕 +ん +鲭 +郐 +酓 +𝑍 +⾏ +⼹ +㐬 +고 +𝟑 +缯 +碤 +濩 +ʰ +佻 +Υ +∗ +賅 +집 +跹 + +ɾ +蔧 +다 +栫 +庰 +欤 +洿 +捾 +𝜍 +𝑄 + +攆 +夂 +檿 +荜 +ц +柖 +唅 +ท +ɦ +讦 +습 +锿 + +纆 +檑 +殰 +歠 +鼑 +Ä +و +☑ +緦 +悁 +偞 +ส +絭 +저 +踯 +騀 +쉰 +蒷 +揗 +儵 +ρ +薅 +ャ +‗ +犒 +旟 + +승 +ང +소 +𝛴 +瀜 +锜 +𣱼 +谳 + +軑 +ポ +楁 +𝑜 +袚 +ྐ +Á +𝑑 +鲀 +牾 +鬌 +푥 +¤ +呴 +‑ +✓ +민 +⼦ +ⅰ +⽉ +擿 +ч +➝ +가 +≳ +漥 +踖 +枧 +莝 +⻘ +傧 +𝑢 +ю +赍 +q +絫 +о +ア +ℐ +髫 +齢 +湎 +甓 +揿 + +ℋ +怹 +자 +⑦ +져 +椟 +鶟 +浕 +ー +𝛂 +偾 +⃗ +喑 +鹡 +≦ +磽 +ⅆ + +葂 +鶱 +ン +貇 +褡 +▴ +것 +喿 +つ +闚 + +盳 +𝟒 +雔 +洭 +殫 +楎 +£ +^ +葲 +𝟖 +眗 +棸 +潏 +熕 +𝟐 +품 +သ +樳 +⁴ +イ +㈢ +렴 +脰 +돈 +⑮ +钲 +𝒘 +訽 +爟 +幨 +枻 +亷 +猃 +σ +黩 +嘑 +榹 +⁡ +鍧 +𝑋 +枘 +𝑥 +원 +睚 +饔 +酲 + +顸 +람 +𝐫 +脁 +½ +긴 
+ʔ +Ⅰ +旆 +죽 + +궐 + +奡 +㭃 +杝 +忾 +ม +掮 +饍 +摛 +쓰 +慊 +踣 +푅 +悽 +礅 +毄 +俓 +데 +冣 +만 +驖 +𤉣 +̃ +廾 +匵 +阇 +𤸫 +戣 +刌 +剕 +杅 +ο +蒥 +ː +癃 +蒬 +â +À +嗥 +우 +ケ +」 +聩 +ë +吽 +檌 +苰 +⑹ +Ÿ + +⑥ +노 + +˄ +鸫 +廛 +㱿 +鹛 +赟 +℅ +菿 +辳 +阼 +𝒇 +哋 +♀ +氕 +砤 +† +舡 +偝 +飜 +넓 +鈜 +ầ +닝 +禚 +匲 +〉 +Ф +锊 +ϵ +∙ +惛 +箧 +𝑦 +宬 +𝑀 +臙 +𩡶 + +¡ +潀 +수 +敃 +か +أ + +苌 +饘 +咝 +𝑼 +∘ +涷 +樍 +厣 +蝼 +墻 +Ñ +秅 +︒ +∅ +↵ +葹 +ỽ +𝑗 +た +일 +蒊 +치 +竢 +¨ +佢 +潵 +櫼 +軵 +𧕿 +倨 +歱 +瘅 +𝐭 +黾 +脼 +ê +땋 +鶷 +ё +鹯 +掲 +\ +𨳡 + +Г +ι +탁 +溞 +殪 +菭 +𝛥 +擛 +録 +㥥 +∀ +锇 +锃 +편 +餬 +瘻 +ཟ +豤 +로 +ɸ +ℎ +랑 +ʃ +鼹 +臬 +ŋ +巵 +譁 +w +窳 +蓔 +䉜 +浧 +酂 +⒀ +མ +椠 +槖 + +衄 +𨐨 +̿ +ご +⺗ +顇 +𝒫 +搕 +ミ +𪪋 +« +䣛 +鹩 +鴈 +п +는 +䋰 +𝛹 +犕 +呌 +𝒮 +𝑪 +鋎 +嚻 +杚 +䕊 +蠜 +ザ +𝐂 +☐ +𥘔 + +赜 +Ν +廦 +瓾 +↦ +龉 +⽅ +棂 +𝜌 +큰 +踔 +ラ +。 +剤 +황 +⅜ +僈 +骈 +ɕ +О +м +椑 +𝑟 +纇 +𝑓 +𝐖 +Ш +⎦ +旹 +삶 +ึ +囵 + +す +ⅈ +ت +踽 +陴 +餱 +ป +막 +紟 +방 +剀 +簖 +闬 +キ +鋉 +납 +タ +谵 +詑 +족 +垔 +荋 +旰 +𥘸 +窾 + +신 +𝐎 +𝛃 + + +﹒ +縰 +猲 +郘 +파 +⊕ +镘 +𠊃 + +呔 +𝜗 +ʊ +𝛬 +辏 +陭 +𝑕 +庴 +ʐ +瀌 +倄 +蕞 +ل +絷 +러 +든 +བ +柅 +› +傚 +睠 +Ⅺ +饐 +蔮 +ɟ +莈 +𤨨 +⋯ +犭 +𝜃 +𥹳 +초 +⎞ +遌 +眇 +蓗 +綅 +토 +裰 + +⼼ +虘 +𝑌 +觯 +漶 +钤 +讒 +げ +螬 +鲹 +咷 +蓞 +僂 +𝒉 +が +桮 +포 +쟈 +柽 +ウ +綟 +缟 +䁾 +钭 +烕 +厠 + +孭 +礉 +­ +谲 + +⼤ +𝒒 +旒 +㫄 + + +鳓 +挈 +재 +ད +𧊒 +蝝 +𝐺 +懱 +芢 + +ⅼ +Ú +𝑱 +翯 +芶 +厽 +遉 +鲒 +η +𝛾 +趮 +虆 +汸 +嬖 +糈 +窸 + +추 +棬 +懔 +硁 +ぶ +抟 +胕 +𝑧 +⌦ +碫 +Ⅵ +속 +𝐚 + +Ç +행 +Ɵ +⑱ +贽 +箤 +р +敒 +衤 +풍 +⊛ +慉 +ψ +© +광 +ℕ +屣 +臌 +旵 +臁 +‡ +癎 +閡 +𡵂 +襐 +畟 + +萪 +娒 +瘼 +庳 +천 +觌 +Α +と +奁 +煃 +؟ +◯ +의 +攎 + +𝐞 +J +𢦒 +❑ +벤 +𝐒 +リ +蒉 +𝐱 +朹 +㈤ +„ +䗬 +Ι +ཀ +𡜵 +俣 +疬 + +墥 +吣 +У +榀 +絟 + +旓 +𝐛 +𝜷 +瑮 +≔ +笾 +ζ +김 +暵 +𝜹 +逶 +萙 +欇 +俧 +籴 +絰 +揶 +ǔ +宂 +伩 +Ө +菞 +梕 +エ +蹚 +제 +Š +沝 + +𦳢 +𝒱 +揠 +ℏ +𝐹 +箝 +규 +氒 +⼊ +鰌 +筮 +⼩ +대 +𝔾 +䄃 +𝐸 +﹕ +부 + +刓 +ȵ +缛 +기 +缊 +𝟎 +𨟻 +め +捃 +⽚ +鍪 +灆 +迻 +⾦ +荗 +v +𡒊 +汍 +斲 +姕 + +儴 +偒 +辤 +芀 +蝥 +ń +臥 +椀 +㪚 +š +담 +ø +䈰 +睏 +テ +﹐ + +참 +楘 +𝒌 +劓 +ɪ +醑 +绹 +諓 +𝛉 +ズ +怼 +埘 +臽 +잡 +镢 +𝜖 +진 +踬 +谠 +﹥ +髺 +腞 +현 +嘭 +ʑ +蓌 +〜 +锠 +蓶 +る + +∼ +枎 +緗 +薠 +芈 +耪 +𝒎 +謼 + +瘳 +诨 +忤 +œ +⇡ + +鲣 +ⅵ +Τ +㯮 + +㶲 +ⅹ +䙴 +坴 +馑 +缹 +椦 +ô +⼈ +フ +誏 +э +哙 +愎 + +埽 +祲 +마 +殍 +菋 +懑 + +辇 +鍤 +𝜀 +ɜ +り +𝐷 +㕞 +瑵 + +蔨 +Ⅶ +镴 +ภ +𝝀 +𢶡 +⃝ +ơ +柢 +𧴗 +ʁ +攙 + +な +𝑏 +挴 +餧 +絇 +怄 +曏 
+洟 +軷 +∉ +咍 +⎪ +樛 +𝑣 +웃 +椊 +黢 +𝑩 +誩 +伓 +戠 +橥 +⟨ +豰 +懥 +涖 +綘 +詬 +ွ +˚ +刽 +ɑ +격 +稖 +𝟏 +禝 +墦 +𝑊 + +択 +檙 +∝ +颟 +诂 +𝐧 +踲 +𝜛 +𝑰 + +鲬 +⁸ +ǎ +문 + +紬 +楲 +䊭 +枨 +膑 +õ +던 +Ⅴ +秏 +馔 +拊 +缗 +隠 +牀 +淲 +鬰 +綵 +鶑 +詎 +慙 +劒 +閲 +鎡 +淒 +屨 +鉢 +扃 +鳬 +閤 +馿 +翛 +駸 +蛩 +驂 +嵲 +覩 +牋 +湲 +蓴 +賸 +遡 +翫 +嫋 +惻 +妬 +罇 +龏 +鵷 +閙 +鎸 +朅 +巉 +僞 +洶 +磧 +筯 +慇 +鷁 +斾 +斸 +酹 +幘 +羶 +閽 +薤 +泝 +覯 +怱 +缾 +氳 +躊 +檝 +擣 +虀 +誚 +甃 +艤 +櫳 +醖 +壚 +涴 +崦 +秪 +潄 +濆 +駡 +坰 +闉 +縑 +躕 +颻 +燠 +輙 +鏁 +嶮 +薖 +輈 +綆 +覷 +蹔 +攄 +鐩 +鑱 +羃 +轓 +麤 +驀 +欵 +亙 +朮 +邐 +箠 +怳 +鋩 +鷃 +篘 +蔌 +諄 +旐 +慍 +欷 +頽 +蜺 +覊 +禋 +秔 +蜩 +嚬 +櫺 +軿 +痾 +笻 +猱 +毳 +泬 +竚 +齪 +搘 +欻 +釂 +嚥 +誑 +籩 +韉 +幙 +舠 +飣 +颭 +颸 +歔 +屧 +巇 +艫 +壖 +牓 +薝 +銛 +皪 +枿 +剗 +歘 +鸂 +邅 +衒 +荄 +鴂 +嫗 +顦 +瀼 +繄 +搆 +狖 +貰 +醆 +秖 +蹀 +頳 +纒 +憇 +溽 +澦 +讐 +灩 +箎 +螿 +鰥 +飀 +澒 +矻 +枌 +擡 +鷖 +齬 +纊 +挼 +齟 +錙 +屩 +蠧 +氅 +漭 +躚 +翺 +瘵 +螘 +鵶 +㶁 +斵 +饜 +岏 +䍦 +哢 +彴 +豗 +靨 +鋋 +禳 +覘 +鞚 +擻 +涘 +溷 +沴 +嶓 +褊 +罏 +齏 +醲 +繅 +舴 +釃 +厖 +闤 +閴 +藂 +譍 +糲 +籞 +躞 +餳 +遰 +倐 +嘖 +鷀 +暍 +韝 +蘺 +齁 +醽 +醨 +憀 +䕷 +跕 +拶 +垤 +鸎 +漙 +躭 +傴 +蕕 +嘒 +晻 +糵 +閈 +嫠 +斁 +鶗 +詶 +囘 +羇 +橛 +鞬 +磈 +粃 +阨 +塿 +敧 +氊 +芼 +襆 +迍 +鬛 +憒 +釅 +蓐 +奬 +頲 +髠 +抔 +葅 +槧 +跧 +揜 +渇 +餔 +罝 +裯 +蹁 +椶 +幰 +逰 +饁 +棃 +噀 +轔 +囁 +惸 +騑 +呪 +鬐 +綌 +醤 +䆉 +艣 +鐶 +夐 +摐 +鸇 +睎 +羝 +紼 +鞿 +噉 +磑 +闗 +筩 +駮 +蹌 +縢 +闠 +鬙 +谹 +榾 +觳 +皁 +晼 +啗 +簣 +騕 +蹣 +麰 +觧 +怊 +朞 +鱍 +蟣 +畚 +蠁 +舁 +瞇 +劚 +鰐 +籯 +鬖 +柮 +飱 +帟 +酇 +崿 +霪 +緌 +踆 +欃 +縟 +搦 +琖 +搥 +倀 +觫 +遝 +嚅 +聵 +藋 +筴 +喁 +窻 +穏 +牥 +鎩 +礲 +膴 +鞵 +醵 +斚 +縲 +裀 +齷 +騃 +袠 +谺 +靦 +帬 +鯈 +曀 +灔 +崷 +趂 +徯 +闃 +洧 +獪 +稏 +煢 +歈 +呶 +壈 +襃 +旴 +檟 +簦 +谽 +箵 +謡 +慝 +餖 +稌 +朣 +禖 +嚀 +嵂 +黷 +甖 +洑 +釡 +蕝 +甆 +翣 +篸 +隮 +滃 +裌 +蔀 +籖 +秬 +鷴 +啅 +慼 +捄 +咮 +睟 +譎 +嘷 +駃 +檥 +蹐 +窊 +駞 +雘 +趯 +篲 +讋 +睍 +毰 +憗 +鳷 +嚲 +圞 +歃 +緼 +賫 +籋 +繐 +麏 +灕 +礧 +歜 +飇 +鵁 +疢 +賖 +窆 +螮 +毹 +硉 +耡 +甔 +篛 +娭 +髩 +燋 +輜 +籧 +撝 +嬙 +徃 +驦 +𡏖 +麕 +馹 +覔 +鶠 +褷 +綍 +螗 +嗈 +彯 +篨 +炰 +鄮 +噞 +尅 +鷰 +鋭 +饉 +霢 +䔩 +坱 +裓 +帨 +忺 +豅 +栱 +謦 +傯 +誵 +骭 +潩 +鬒 +嵫 +悮 +扊 +扅 +轢 +惝 +臈 +舃 +鞾 +譟 +袵 +眎 +簏 +埸 +堧 +憸 +雰 +腷 +嵓 +隩 +趄 +墐 +褦 +艑 +狴 +玿 +竪 +恧 +姱 +抆 +恓 +霣 +躓 +鞲 +晬 +簴 +唼 +曵 +褕 +罣 +縐 +衘 +巃 +攲 +輀 +貎 +哳 +醭 +鋣 +僛 +迕 +蠭 +膓 +欝 +洊 +敺 +纎 +栟 +鞓 +蛬 +灺 +痏 +恡 +踸 +霔 +濵 +昻 +鉘 +楖 +竛 +竮 +窱 +幬 +慤 +儗 +黮 +嘐 +睆 +頇 +麑 +壼 +㦸 +顋 +瘥 +苖 +韈 +盻 +袷 +矼 +塼 +鐍 +傞 +苶 +吷 +噇 +鶖 +僣 +髧 +䅉 +鯫 +襏 
+縳 +蠨 +痡 +髽 +剉 +蝱 +鄽 +匼 +嚚 +襫 +缿 +鵊 +燖 +忸 +摝 +攅 +牷 +氎 +騣 +颿 +虡 +腯 +漘 +矓 +祫 +顢 +綀 +弮 +柙 +蔾 +胾 +筤 +馽 +砆 +冩 +賙 +唶 +麛 +簜 +蹏 +屼 +鞶 +煑 +踠 +愀 +蠒 +頬 +韲 +戞 +畆 +笊 +搨 +捽 +絙 +覉 +澨 +趫 +矹 +穮 +愠 +劘 +轣 +卭 +鼪 +杕 +轗 +擐 +蚿 +恚 +檛 +𩕳 +靆 +轕 +餼 +頮 +槹 +蔉 +皜 +扄 +鮆 +轑 +蹡 +嵽 +甎 +蟈 +橅 +笴 +膰 +蕣 +澘 +髿 +樕 +褵 +蜋 +窼 +櫧 +雊 +胷 +嘵 +麄 +裋 +繢 +啐 +臛 +簁 +巓 +羜 +攧 +簮 +壊 +齩 +晹 +臲 +鬵 +齠 +媮 +幮 +壍 +蠛 +槜 +羖 +窓 +隃 +嚘 +輳 +籹 +凴 +崕 +獍 +嗸 +趦 +囅 +戺 +涬 +諉 +箯 +輊 +桹 +嵷 +㲲 +愊 +蒱 +洚 +赩 +輴 +幈 +齔 +嗁 +阽 +圠 +荈 +碔 +揎 +巀 +洏 +卼 +𨁝 +痁 +黳 +嗾 +䆗 +戃 +蕆 +頋 +悤 +掎 +㯝 +吚 +猘 +鮎 +鬴 +墁 +飋 +呿 +窀 +沲 +枒 +窌 +爼 +頞 +譡 +鶋 +湩 +㦬 +僾 +斒 +醼 +鶂 +磥 +揫 +犗 +齗 +鄶 +囏 +崪 +爞 +籓 +斮 +觝 +嵸 +驔 +䨴 +頺 +萑 +珓 +牸 +闒 +凘 +悢 +蟭 +濈 +嬄 +翽 +旍 +鶢 +罋 +輠 +怩 +頖 +趍 +壝 +嫮 +蕋 +踦 +轇 +眘 +巗 +嶭 +糓 +甽 +籺 +矟 +佖 +絏 +憮 +懡 +駈 +擕 +淟 +皡 +膋 +潨 +鳲 +趠 +麨 +頩 +漻 +輗 +墄 +賮 +㴩 +莟 +縦 +岝 +醻 +曚 +䙰 +噭 +醥 +筰 +躧 +踘 +鑕 +咈 +僶 +鶊 +鬂 +聼 +騐 +壒 +磎 +歗 +淈 +隟 +狃 +縋 +媻 +趲 +惙 +呫 +聮 +羾 +尫 +佽 +髼 +繋 +鬘 +旜 +疐 +阬 +䰐 +塈 +徤 +祊 +灂 +祅 +樷 +颾 +凟 +頀 +蠏 +塒 +衹 +婬 +裩 +粞 +憯 +匭 +筈 +盬 +霮 +黕 +靮 +伻 +緺 +瘝 +羑 +醸 +樝 +僎 +絓 +噆 +愞 +痗 +愽 +岊 +黤 +湑 +搉 +綯 +焮 +疉 +楛 +玼 +喤 +粔 +飂 +贐 +緉 +覰 +胔 +鞳 +摑 +墢 +斅 +誶 +僝 +鹺 +諌 +齅 +嵼 +讟 +冦 +脝 +婣 +緐 +茰 +飶 +欎 +慁 +抝 +瘉 +𡎺 +鈯 +瘃 +麫 +匊 +窞 +羓 +氄 +嚌 +姤 +橑 +駬 +冺 +騠 +㕙 +楶 +靸 +圎 +尀 +䙀 +鄏 +軃 +竁 +㹠 +刜 +剨 +罛 +鏹 +鬉 +簨 +藭 +藷 +僇 +瀫 +袨 +忮 +冡 +齯 +楪 +囋 +蟉 +醱 +尠 +牣 +攟 +袿 +齾 +甞 +啑 +潚 +樐 +絖 +酖 +觖 +骹 +嶅 +玃 +嫜 +廹 +儤 +矂 +艓 +挱 +骳 +嵳 +洴 +礓 +厪 +﨑 +禜 +籊 +瓻 +彛 +狁 +腪 +骾 +娯 +罻 +璅 +簳 +姢 +猰 +眹 +䴥 +堘 +搯 +怤 +緫 +聫 +涊 +熛 +輤 +䡾 +譌 +髇 +攛 +稭 +媕 +鬷 +跰 +縚 +鉧 +踧 +嚭 +襮 +藞 +滮 +颷 +荂 +蓰 +怫 +閧 +臕 +稛 +怗 +闑 +抶 +薶 +嶕 +瀺 +𥫗 +墝 +埆 +皥 +惷 +鞔 +鞺 +蟛 +瀡 +鎁 +酧 +恝 +齓 +嚄 +簔 +蟺 +㔶 +胹 +憖 +惄 +鸒 +貛 +軏 +縗 +蓻 +娵 +抺 +鼛 +虩 +歕 +矑 +繂 +襚 +倂 +廥 +諝 +虗 +弜 +兟 +繿 +偘 +翶 +肻 +棙 +斆 +碨 +醎 +蟢 +銙 +躠 +櫩 +椮 +絀 +鷾 +溳 +詖 +葓 +谼 +𦩘 +韔 +翿 +呑 +馡 +騊 +堁 +蓏 +䟃 +頟 +渢 +趑 +堄 +滛 +擫 +豭 +騩 +騘 +䍧 +彍 +忭 +餙 +馺 +忩 +芣 +矴 +噂 +滍 +慫 +𨍏 +怲 +扵 +搊 +昩 +嶻 +禬 +憃 +忼 +榰 +箾 +撁 +鈆 +袗 +脤 +騞 +哤 +螀 +靧 +梲 +囦 +魖 +褠 +䭔 +煆 +挃 +宷 +熉 +朘 +憭 +䒠 +謭 +鷤 +拕 +骫 +穾 +襭 +喓 +冞 +勩 +媢 +麚 +椓 +俙 +幐 +磝 +蜎 +灙 +漦 +㛹 +䭀 +㜷 +粻 +懟 +箳 +滣 +糉 +冐 +韤 +湱 +糭 +栳 +勌 +慱 +㸌 +罫 +筞 +霿 +躶 +玞 +磉 +罦 +祴 +媟 +猒 +擭 +恠 +嵁 +屴 +慆 +庬 +蟁 +㹀 +薧 +鷕 +渻 +朂 +愯 +齚 +蝻 +胏 +饙 +鳦 +鸃 +叅 +肧 +篂 
+脗 +雺 +飰 +筀 +頥 +毶 +弌 +逓 +瞍 +絁 +鏚 +嚵 +攂 +醄 +奼 +獫 +絣 +靷 +畮 +褉 +棁 +揑 +楥 +橤 +襥 +蹮 +窔 +忪 +潠 +杇 +黲 +擄 +蚻 +蘙 +虙 +袐 +陿 +帊 +醟 +髖 +㞦 +鱭 +譸 +鮞 +栧 +扺 +脽 +擉 +岨 +黈 +餻 +佪 +遻 +鋟 +瞶 +廽 +懨 +墖 +玁 +籉 +宼 +鰋 +瑽 +垖 +酕 +漰 +戹 +蝛 +瑲 +阤 +褆 +儛 +䍽 +觕 +箘 +碯 +灨 +燀 +膇 +韀 +䳏 +詿 +禂 +韣 +踡 +碏 +尵 +莭 +庻 +篿 +狤 +㘞 +艭 +殱 +鵔 +槮 +猧 +劙 +獝 +㭊 +㾕 +蠚 +帤 +蹢 +蛚 +輼 +麀 +檃 +䰒 +䪫 +悾 +濳 +杗 +揾 +駏 +撦 +耈 +蟟 +狌 +鸖 +䨥 +餫 +鍰 +耉 +毚 +袽 +䱥 +慓 +䔿 +艖 +舋 +弰 +蠺 +嫓 +蚳 +髾 +喨 +鴐 +䍥 +韍 +柹 +掁 +薋 +攕 +飺 +凢 +麌 +嫰 +鑚 +黦 +葠 +吿 +栰 +踶 +芿 +穭 +啝 +筓 +褁 +稇 +顚 +䎘 +恇 +珷 +緪 +墠 +蛣 +蛜 +讕 +疻 +惎 +袝 +霡 +罸 +鬽 +苢 +喭 +飊 +唎 +澾 +襋 +皭 +廏 +蔿 +穊 +䝟 +駊 +獹 +夣 +褾 +慴 +軥 +讁 +軰 +瞷 +𡋯 +晜 +潗 +衋 +揵 +覼 +鱐 +醡 +䏰 +侐 +亁 +桞 +驘 +鬋 +鷽 +懞 +㵳 +儳 +豝 +傺 +搒 +縧 +硾 +䏶 +覻 +薍 +憝 +榠 +湆 +皵 +鎞 +菆 +糇 +矉 +搤 +紃 +峿 +磹 +甒 +琭 +𩥇 +菢 +禡 +渹 +刅 +迒 +敂 +蹜 +磓 +傪 +縿 +㕮 +涏 +䰀 +㡛 +韛 +犠 +餦 +圝 +焫 +㝢 +潬 +馵 +澟 +鱏 +譾 +㪍 +煼 +鍜 +窽 +紾 +堨 +䕸 +穅 +戅 +穄 +駴 +偫 +煗 +媠 +酘 +矬 +貆 +茞 +骩 +扠 +岞 +潓 +炧 +陊 +栭 +釱 +㡚 +篴 +耞 +鞉 +䋏 +𤫩 +椸 +儜 +痀 +謷 +潙 +寠 +牐 +嫭 +慅 +獧 +鈒 +欿 +薳 +蟂 +郲 +軨 +斨 +訦 +𠴲 +剺 +駪 +贙 +禫 +噣 +茢 +茙 +鄼 +揷 +魌 +䫻 +嗋 +噐 +侲 +諵 +𠺕 +挍 +䑳 +㨷 +槸 +靘 +㩧 +虣 +瑿 +衱 +襹 +餭 +㗶 +枑 +悋 +纑 +嶫 +儓 +髵 +甗 +榝 +㗭 +贗 +熸 +嬃 +礌 +偭 +樠 +栮 +鷼 +鵀 +澬 +眂 +牿 +骴 +呞 +爕 +牎 +巹 +帉 +砠 +梴 +䛏 +攃 +餁 +哿 +蹝 +崺 +閌 +醝 +臡 +麖 +駼 +賵 +夘 +骻 +愡 +俔 +諐 +觩 +莂 +饈 +殣 +溠 +冱 +埓 +厫 +虥 +芄 +慽 +竃 +埿 +仭 +褼 +倛 +韸 +牗 +幖 +禈 +穧 +蜧 +諞 +脞 +蝃 +飃 +煁 +涒 +谾 +覢 +赮 +鼘 +艗 +䶉 +鴥 +轒 +睅 +傔 +惵 +唈 +懆 +磣 +膢 +堶 +囈 +瘕 +誷 +瑘 +絝 +鬈 +嘽 +鷅 +梜 +喎 +鼟 +㟧 +劻 +眑 +剴 +痎 +餟 +庌 +菷 +梐 +吺 +躘 +慞 +罼 +穨 +摏 +釄 +莋 +呺 +砅 +鴽 +㘭 +㟅 +艴 +犉 +籕 +跐 +惏 +陗 +刋 +襘 +醹 +紽 +痌 +㗀 +撋 +陼 +駷 +艼 +踼 +癏 +慠 +趒 +邍 +姞 +䂬 +堲 +苙 +椌 +嗃 +挶 +岯 +禗 +嵔 +觡 +豜 +睩 +㒿 +塠 +燂 +扤 +恟 +鬝 +鬇 +鬡 +揳 +霠 +㗫 +苐 +蒀 +圌 +戭 +䖃 +𥈭 +勮 +耝 +轞 +胮 +墯 +枮 +罿 +浺 +綪 +爓 +蘃 +襍 +轜 +閠 +畽 +鄊 +嶆 +籭 +蠯 +陑 +瘽 +迆 +賷 +䍡 +韂 +躃 +禴 +簄 +瓟 +碐 +躨 +侜 +岍 +䃸 +趚 +髐 +榅 +粣 +屝 +鴃 +圁 +蝜 +黫 +僽 +丗 +靣 +湏 +抏 +㟽 +跙 +餤 +朙 +㹞 +瞖 +繣 +㨫 +罙 +糒 +惉 +葽 +鼮 +蕳 +豏 +𥱼 +鵮 +獦 +悕 +𠴨 +闟 +惽 +慿 +隉 +椷 +𩅰 +艛 +眽 +凓 +儃 +奨 +埀 +瑫 +駚 +濇 +緶 +峉 +礨 +髢 +瞯 +壥 +姡 +㟯 +髬 +啀 +㶿 +歅 +殀 +縩 +疈 +鳸 +霳 +稬 +圊 +彚 +裠 +埳 +褋 +㔩 +矲 +剶 +硋 +聦 +峞 +浰 +窵 +嘂 +睘 +簵 +腒 +韘 +躣 +甈 +忲 +舽 +襂 +硠 +脃 +鐏 +奯 +脧 +矕 +䠞 +駹 +豶 +訑 +柸 +鰅 +瘨 +趿 +糦 +蟏 +饛 +尰 +諑 +汃 +毺 +鋃 +絚 +馧 +艬 +枍 +爊 +峗 
+泙 +碖 +鵕 +尩 +閗 +𤧚 +幩 +塉 +箊 +覂 +玒 +橧 +謟 +庨 +籔 +欑 +厎 +尭 +氉 +蠈 +䓞 +矙 +梡 +瀩 +溔 +煴 +蔲 +僬 +嵢 +梩 +弝 +𣙙 +鞟 +敉 +鮚 +湠 +鐐 +爣 +裻 +䶎 +𦨴 +謿 +垾 +蝂 +睂 +癙 +韽 +㟳 +桒 +鳿 +樏 +峛 +瑉 +僄 +顣 +衺 +殗 +肦 +圑 +朒 +喌 +犦 +㰅 +疁 +氃 +吰 +陻 +盰 +娀 +魶 +㖃 +曒 +娿 +獱 +孏 +酅 +蝡 +齰 +莬 +鄀 +逥 +挿 +觵 +縆 +㟝 +繍 +碙 +㑂 +䎳 +兾 +壸 +賝 +桯 +跁 +跒 +蔍 +舼 +忀 +懭 +媌 +罭 +菵 +狔 +靿 +拪 +㲉 +䔲 +嬀 +鵽 +涳 +朾 +𡸣 +𢫫 +虈 +㜮 +顑 +櫋 +蔪 +旝 +湡 +蹛 +稆 +唽 +㟏 +熂 +龡 +煟 +韅 +韐 +慂 +剳 +掫 +兠 +摋 +羫 +璊 +鵻 +駓 +佌 +蜹 +晲 +矒 +玅 +剰 +斶 +紖 +懴 +駜 +羢 +麳 +㳷 +馞 +爥 +鍚 +鑢 +螵 +嗺 +鏨 +𠙶 +疪 +鷔 +鮧 +轊 +栘 +鼜 +睗 +蟘 +枓 +䖟 +剠 +瞤 +圛 +椳 +籸 +䪌 +鯹 +湌 +丳 +賧 +縭 +檾 +𦨻 +撆 +䩫 +磢 +惥 +譀 +罤 +鞸 +鉎 +㶏 +膁 +甋 +瓀 +懹 +槢 +硊 +弆 +琫 +嵠 +駻 +湢 +杮 +䌨 +訹 +藇 +穯 +蠉 +曭 +蹎 +詄 +毷 +𩃎 +熁 +灜 +蜫 +蜳 +昈 +帩 +鈋 +䐹 +顖 +鄹 +匶 +毾 +礜 +堭 +婞 +鷿 +㙞 +詀 +瘮 +䫜 +㾪 +捘 +屫 +誧 +䲔 +閍 +蒳 +㬋 +遟 +嶀 +葐 +蜼 +㻱 +曡 +䃜 +濴 +䦱 +霫 +譆 +霋 +蕰 +襓 +氋 +鴷 +魦 +㩻 +㡠 +灉 +贑 +燑 +峝 +輷 +烻 +耼 +螉 +跜 +豩 +㑃 +藙 +鋂 +胐 +𣔻 +紒 +瓓 +塯 +辴 +趷 +堛 +㒟 +㗲 +㬊 +䄡 +卄 +姧 +猓 +躗 +覤 +醊 +兎 +罯 +痯 +覸 +詉 +癿 +岋 +歝 +茟 +㘆 +㮰 +淜 +𥉌 +㫰 +鈌 +毵 +狉 +贜 +峬 +汻 +誖 +烓 +睋 +潎 +䲺 +㠓 +歖 +𠜱 +槵 +熚 +萷 +磤 +絸 +鷞 +聻 +屷 +㝵 +諕 +瘂 +㺷 +蚰 +柦 +䍐 +泿 +礰 +摎 +㜕 +㻞 +洓 +喍 +囌 +囐 +䙱 +腨 +妉 +鄛 +鄥 +㵝 +輧 +鱄 +騟 +鈚 +廜 +𨗨 +㶼 +膞 +崯 +硞 +萆 +眒 +譩 +揬 +藑 +匌 +㠾 +㥏 +㢮 +䕢 +帣 +酭 +枦 +孅 +鞙 +丷 +鍭 +䤴 +餂 +愗 +冘 +埛 +㒇 +郕 +蔯 +簰 +刔 +蠩 +耏 +鞹 +𧑅 +觹 +䐑 +磶 +蹵 +鵃 +耛 +蓤 +臄 +轙 +庤 +㒩 +翐 +榥 +晀 +輣 +蟚 +拲 +皠 +穱 +䃔 +䃧 +窡 +絍 +礿 +鑞 +栯 +㾓 +掿 +厞 +淂 +撶 +伹 +鹻 +軓 +岹 +蚷 +榸 +刾 +艂 +㤝 +塕 +蚔 +藾 +攓 +鏬 +珫 +黪 +蟧 +猭 +漑 +粺 +驆 +撘 +亾 +㼌 +蝑 +澓 +揞 +欱 +愶 +泲 +醷 +螴 +芚 +絻 +轃 +漮 +唪 +岉 +鬀 +䱹 +齖 +䂓 +趢 +荓 +覶 +鯾 +諿 +槥 +嚆 +爢 +瓬 +笐 +篢 +舝 +襵 +鎒 +𤝞 +肭 +瘇 +笓 +餑 +豋 +湗 +緎 +肐 +胲 +掤 +潫 +䖴 +𠎝 +𨺗 +諢 +毈 +寱 +唲 +䃭 +峮 +狘 +韊 +䬝 +呰 +㹱 +碞 +畞 +㠌 +黭 +蚘 +豵 +穥 +尯 +㳇 +隵 +灇 +壜 +楰 +彲 +甤 +綹 +旞 +𡏟 +曁 +喩 +𥲤 +郈 +塺 +訧 +絿 +掔 +蠮 +𡱰 +䃺 +宻 +灎 +羵 +𨠵 +糚 +摉 +壷 +勴 +瑃 +鎝 +𥜥 +婥 +鬺 +扢 +肣 +溰 +磩 +耇 +宎 +㔇 +霱 +敚 +汳 +鏄 +儹 +隥 +㿉 +膆 +崏 +𦭵 +郔 +扂 +垗 +㳂 +礛 +缻 +垜 +晱 +訩 +蘪 +珇 +怮 +垝 +㔢 +憛 +痝 +蟨 +鞁 +鶤 +肎 +傝 +䢆 +䰄 +𥊚 +㖀 +㠭 +壵 +墋 +㠔 +橜 +怓 +蚹 +塛 +憪 +鋝 +腶 +嶾 +翍 +溓 +齼 +蔂 +䃂 +鉺 +攑 +瓐 +泎 +眤 +邘 +崝 +稡 +愸 +髥 +輹 +詨 +髆 +麃 +虤 +洐 +婐 +挏 +峑 +嶣 +篬 +葄 +瑎 +瓉 +㳅 +葼 +姙 +䪜 +𩇕 +焭 +剚 +濪 +霵 +僒 + +羭 diff --git 
a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..567898b49de2707853454682f05e0c504c0085b9 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_dict.txt @@ -0,0 +1,18383 @@ +  +一 +乙 +二 +十 +丁 +厂 +七 +卜 +八 +人 +入 +儿 +匕 +几 +九 +刁 +了 +刀 +力 +乃 +又 +三 +干 +于 +亏 +工 +土 +士 +才 +下 +寸 +大 +丈 +与 +万 +上 +小 +口 +山 +巾 +千 +乞 +川 +亿 +个 +夕 +久 +么 +勺 +凡 +丸 +及 +广 +亡 +门 +丫 +义 +之 +尸 +己 +已 +巳 +弓 +子 +卫 +也 +女 +刃 +飞 +习 +叉 +马 +乡 +丰 +王 +开 +井 +天 +夫 +元 +无 +云 +专 +丐 +扎 +艺 +木 +五 +支 +厅 +不 +犬 +太 +区 +历 +歹 +友 +尤 +匹 +车 +巨 +牙 +屯 +戈 +比 +互 +切 +瓦 +止 +少 +曰 +日 +中 +贝 +冈 +内 +水 +见 +午 +牛 +手 +气 +毛 +壬 +升 +夭 +长 +仁 +什 +片 +仆 +化 +仇 +币 +仍 +仅 +斤 +爪 +反 +介 +父 +从 +仑 +今 +凶 +分 +乏 +公 +仓 +月 +氏 +勿 +欠 +风 +丹 +匀 +乌 +勾 +凤 +六 +文 +亢 +方 +火 +为 +斗 +忆 +计 +订 +户 +认 +冗 +讥 +心 +尺 +引 +丑 +巴 +孔 +队 +办 +以 +允 +予 +邓 +劝 +双 +书 +幻 +玉 +刊 +未 +末 +示 +击 +打 +巧 +正 +扑 +卉 +扒 +功 +扔 +去 +甘 +世 +艾 +古 +节 +本 +术 +可 +丙 +左 +厉 +石 +右 +布 +夯 +戊 +龙 +平 +灭 +轧 +东 +卡 +北 +占 +凸 +卢 +业 +旧 +帅 +归 +旦 +目 +且 +叶 +甲 +申 +叮 +电 +号 +田 +由 +只 +叭 +史 +央 +兄 +叽 +叼 +叫 +叩 +叨 +另 +叹 +冉 +皿 +凹 +囚 +四 +生 +矢 +失 +乍 +禾 +丘 +付 +仗 +代 +仙 +们 +仪 +白 +仔 +他 +斥 +瓜 +乎 +丛 +令 +用 +甩 +印 +尔 +乐 +句 +匆 +册 +卯 +犯 +外 +处 +冬 +鸟 +务 +包 +饥 +主 +市 +立 +冯 +玄 +闪 +兰 +半 +汁 +汇 +头 +汉 +宁 +穴 +它 +讨 +写 +让 +礼 +训 +议 +必 +讯 +记 +永 +司 +尼 +民 +弗 +弘 +出 +辽 +奶 +奴 +召 +加 +皮 +边 +孕 +发 +圣 +对 +台 +矛 +纠 +母 +幼 +丝 +邦 +式 +迂 +刑 +戎 +动 +扛 +寺 +吉 +扣 +考 +托 +老 +巩 +圾 +执 +扩 +扫 +地 +场 +扬 +耳 +芋 +共 +芒 +亚 +芝 +朽 +朴 +机 +权 +过 +臣 +吏 +再 +协 +西 +压 +厌 +戌 +在 +百 +有 +存 +而 +页 +匠 +夸 +夺 +灰 +达 +列 +死 +成 +夹 +夷 +轨 +邪 +尧 +划 +迈 +毕 +至 +此 +贞 +师 +尘 +尖 +劣 +光 +当 +早 +吁 +吐 +吓 +虫 +曲 +团 +吕 +同 +吊 +吃 +因 +吸 +吗 +吆 +屿 +屹 +岁 +帆 +回 +岂 +则 +刚 +网 +肉 +年 +朱 +先 +丢 +廷 +舌 +竹 +迁 +乔 +迄 +伟 +传 +乒 +乓 +休 +伍 +伏 +优 +臼 +伐 +延 +仲 +件 +任 +伤 +价 +伦 +份 +华 +仰 +仿 +伙 +伪 +自 +伊 +血 +向 +似 +后 +行 +舟 +全 +会 +杀 +合 +兆 +企 +众 +爷 +伞 +创 +肌 +肋 +朵 +杂 +危 +旬 +旨 +旭 +负 +匈 +名 +各 +多 +争 +色 +壮 +冲 +妆 +冰 +庄 +庆 +亦 
+刘 +齐 +交 +衣 +次 +产 +决 +亥 +充 +妄 +闭 +问 +闯 +羊 +并 +关 +米 +灯 +州 +汗 +污 +江 +汛 +池 +汝 +汤 +忙 +兴 +宇 +守 +宅 +字 +安 +讲 +讳 +军 +讶 +许 +讹 +论 +讼 +农 +讽 +设 +访 +诀 +寻 +那 +迅 +尽 +导 +异 +弛 +孙 +阵 +阳 +收 +阶 +阴 +防 +奸 +如 +妇 +妃 +好 +她 +妈 +戏 +羽 +观 +欢 +买 +红 +驮 +纤 +驯 +约 +级 +纪 +驰 +纫 +巡 +寿 +弄 +麦 +玖 +玛 +形 +进 +戒 +吞 +远 +违 +韧 +运 +扶 +抚 +坛 +技 +坏 +抠 +扰 +扼 +拒 +找 +批 +址 +扯 +走 +抄 +贡 +汞 +坝 +攻 +赤 +折 +抓 +扳 +抡 +扮 +抢 +孝 +坎 +均 +抑 +抛 +投 +坟 +坑 +抗 +坊 +抖 +护 +壳 +志 +块 +扭 +声 +把 +报 +拟 +却 +抒 +劫 +芙 +芜 +苇 +芽 +花 +芹 +芥 +芬 +苍 +芳 +严 +芦 +芯 +劳 +克 +芭 +苏 +杆 +杠 +杜 +材 +村 +杖 +杏 +杉 +巫 +极 +李 +杨 +求 +甫 +匣 +更 +束 +吾 +豆 +两 +酉 +丽 +医 +辰 +励 +否 +还 +尬 +歼 +来 +连 +轩 +步 +卤 +坚 +肖 +旱 +盯 +呈 +时 +吴 +助 +县 +里 +呆 +吱 +吠 +呕 +园 +旷 +围 +呀 +吨 +足 +邮 +男 +困 +吵 +串 +员 +呐 +听 +吟 +吩 +呛 +吻 +吹 +呜 +吭 +吧 +邑 +吼 +囤 +别 +吮 +岖 +岗 +帐 +财 +针 +钉 +牡 +告 +我 +乱 +利 +秃 +秀 +私 +每 +兵 +估 +体 +何 +佐 +佑 +但 +伸 +佃 +作 +伯 +伶 +佣 +低 +你 +住 +位 +伴 +身 +皂 +伺 +佛 +囱 +近 +彻 +役 +返 +余 +希 +坐 +谷 +妥 +含 +邻 +岔 +肝 +肛 +肚 +肘 +肠 +龟 +甸 +免 +狂 +犹 +狈 +角 +删 +条 +彤 +卵 +灸 +岛 +刨 +迎 +饭 +饮 +系 +言 +冻 +状 +亩 +况 +床 +库 +庇 +疗 +吝 +应 +这 +冷 +庐 +序 +辛 +弃 +冶 +忘 +闰 +闲 +间 +闷 +判 +兑 +灶 +灿 +灼 +弟 +汪 +沐 +沛 +汰 +沥 +沙 +汽 +沃 +沦 +汹 +泛 +沧 +没 +沟 +沪 +沈 +沉 +沁 +怀 +忧 +忱 +快 +完 +宋 +宏 +牢 +究 +穷 +灾 +良 +证 +启 +评 +补 +初 +社 +祀 +识 +诈 +诉 +罕 +诊 +词 +译 +君 +灵 +即 +层 +屁 +尿 +尾 +迟 +局 +改 +张 +忌 +际 +陆 +阿 +陈 +阻 +附 +坠 +妓 +妙 +妖 +姊 +妨 +妒 +努 +忍 +劲 +矣 +鸡 +纬 +驱 +纯 +纱 +纲 +纳 +驳 +纵 +纷 +纸 +纹 +纺 +驴 +纽 +奉 +玩 +环 +武 +青 +责 +现 +玫 +表 +规 +抹 +卦 +坷 +坯 +拓 +拢 +拔 +坪 +拣 +坦 +担 +坤 +押 +抽 +拐 +拖 +者 +拍 +顶 +拆 +拎 +拥 +抵 +拘 +势 +抱 +拄 +垃 +拉 +拦 +幸 +拌 +拧 +拂 +拙 +招 +坡 +披 +拨 +择 +抬 +拇 +拗 +其 +取 +茉 +苦 +昔 +苛 +若 +茂 +苹 +苗 +英 +苟 +苑 +苞 +范 +直 +茁 +茄 +茎 +苔 +茅 +枉 +林 +枝 +杯 +枢 +柜 +枚 +析 +板 +松 +枪 +枫 +构 +杭 +杰 +述 +枕 +丧 +或 +画 +卧 +事 +刺 +枣 +雨 +卖 +郁 +矾 +矿 +码 +厕 +奈 +奔 +奇 +奋 +态 +欧 +殴 +垄 +妻 +轰 +顷 +转 +斩 +轮 +软 +到 +非 +叔 +歧 +肯 +齿 +些 +卓 +虎 +虏 +肾 +贤 +尚 +旺 +具 +味 +果 +昆 +国 +哎 +咕 +昌 +呵 +畅 +明 +易 +咙 +昂 +迪 +典 +固 +忠 +呻 +咒 +咋 +咐 +呼 +鸣 +咏 +呢 +咄 +咖 +岸 +岩 +帖 +罗 +帜 +帕 +岭 +凯 +败 +账 +贩 +贬 +购 +贮 +图 +钓 +制 +知 +迭 +氛 +垂 +牧 +物 +乖 +刮 +秆 +和 +季 +委 +秉 +佳 +侍 +岳 +供 +使 +例 +侠 +侥 +版 +侄 +侦 +侣 +侧 +凭 +侨 +佩 +货 +侈 +依 +卑 +的 +迫 +质 +欣 +征 +往 +爬 +彼 +径 +所 +舍 +金 +刹 +命 +肴 +斧 +爸 +采 +觅 +受 +乳 +贪 +念 +贫 +忿 +肤 +肺 +肢 
+肿 +胀 +朋 +股 +肮 +肪 +肥 +服 +胁 +周 +昏 +鱼 +兔 +狐 +忽 +狗 +狞 +备 +饰 +饱 +饲 +变 +京 +享 +庞 +店 +夜 +庙 +府 +底 +疟 +疙 +疚 +剂 +卒 +郊 +庚 +废 +净 +盲 +放 +刻 +育 +氓 +闸 +闹 +郑 +券 +卷 +单 +炬 +炒 +炊 +炕 +炎 +炉 +沫 +浅 +法 +泄 +沽 +河 +沾 +泪 +沮 +油 +泊 +沿 +泡 +注 +泣 +泞 +泻 +泌 +泳 +泥 +沸 +沼 +波 +泼 +泽 +治 +怔 +怯 +怖 +性 +怕 +怜 +怪 +怡 +学 +宝 +宗 +定 +宠 +宜 +审 +宙 +官 +空 +帘 +宛 +实 +试 +郎 +诗 +肩 +房 +诚 +衬 +衫 +视 +祈 +话 +诞 +诡 +询 +该 +详 +建 +肃 +录 +隶 +帚 +屉 +居 +届 +刷 +屈 +弧 +弥 +弦 +承 +孟 +陋 +陌 +孤 +陕 +降 +函 +限 +妹 +姑 +姐 +姓 +妮 +始 +姆 +迢 +驾 +叁 +参 +艰 +线 +练 +组 +绅 +细 +驶 +织 +驹 +终 +驻 +绊 +驼 +绍 +绎 +经 +贯 +契 +贰 +奏 +春 +帮 +玷 +珍 +玲 +玻 +毒 +型 +拭 +挂 +封 +持 +拷 +拱 +项 +垮 +挎 +城 +挟 +挠 +政 +赴 +赵 +挡 +拽 +哉 +挺 +括 +垢 +拴 +拾 +挑 +垛 +指 +垫 +挣 +挤 +拼 +挖 +按 +挥 +挪 +拯 +某 +甚 +荆 +茸 +革 +茬 +荐 +巷 +带 +草 +茧 +茵 +茶 +荒 +茫 +荡 +荣 +荤 +荧 +故 +胡 +荫 +荔 +南 +药 +标 +栈 +柑 +枯 +柄 +栋 +相 +查 +柏 +栅 +柳 +柱 +柿 +栏 +柠 +树 +勃 +要 +柬 +咸 +威 +歪 +研 +砖 +厘 +厚 +砌 +砂 +泵 +砚 +砍 +面 +耐 +耍 +牵 +鸥 +残 +殃 +轴 +轻 +鸦 +皆 +韭 +背 +战 +点 +虐 +临 +览 +竖 +省 +削 +尝 +昧 +盹 +是 +盼 +眨 +哇 +哄 +哑 +显 +冒 +映 +星 +昨 +咧 +昭 +畏 +趴 +胃 +贵 +界 +虹 +虾 +蚁 +思 +蚂 +虽 +品 +咽 +骂 +勋 +哗 +咱 +响 +哈 +哆 +咬 +咳 +咪 +哪 +哟 +炭 +峡 +罚 +贱 +贴 +贻 +骨 +幽 +钙 +钝 +钞 +钟 +钢 +钠 +钥 +钦 +钧 +钩 +钮 +卸 +缸 +拜 +看 +矩 +毡 +氢 +怎 +牲 +选 +适 +秒 +香 +种 +秋 +科 +重 +复 +竿 +段 +便 +俩 +贷 +顺 +修 +俏 +保 +促 +俄 +俐 +侮 +俭 +俗 +俘 +信 +皇 +泉 +鬼 +侵 +禹 +侯 +追 +俊 +盾 +待 +徊 +衍 +律 +很 +须 +叙 +剑 +逃 +食 +盆 +胚 +胧 +胆 +胜 +胞 +胖 +脉 +胎 +勉 +狭 +狮 +独 +狰 +狡 +狱 +狠 +贸 +怨 +急 +饵 +饶 +蚀 +饺 +饼 +峦 +弯 +将 +奖 +哀 +亭 +亮 +度 +迹 +庭 +疮 +疯 +疫 +疤 +咨 +姿 +亲 +音 +帝 +施 +闺 +闻 +闽 +阀 +阁 +差 +养 +美 +姜 +叛 +送 +类 +迷 +籽 +娄 +前 +首 +逆 +兹 +总 +炼 +炸 +烁 +炮 +炫 +烂 +剃 +洼 +洁 +洪 +洒 +柒 +浇 +浊 +洞 +测 +洗 +活 +派 +洽 +染 +洛 +浏 +济 +洋 +洲 +浑 +浓 +津 +恃 +恒 +恢 +恍 +恬 +恤 +恰 +恼 +恨 +举 +觉 +宣 +宦 +室 +宫 +宪 +突 +穿 +窃 +客 +诫 +冠 +诬 +语 +扁 +袄 +祖 +神 +祝 +祠 +误 +诱 +诲 +说 +诵 +垦 +退 +既 +屋 +昼 +屏 +屎 +费 +陡 +逊 +眉 +孩 +陨 +除 +险 +院 +娃 +姥 +姨 +姻 +娇 +姚 +娜 +怒 +架 +贺 +盈 +勇 +怠 +癸 +蚤 +柔 +垒 +绑 +绒 +结 +绕 +骄 +绘 +给 +绚 +骆 +络 +绝 +绞 +骇 +统 +耕 +耘 +耗 +耙 +艳 +泰 +秦 +珠 +班 +素 +匿 +蚕 +顽 +盏 +匪 +捞 +栽 +捕 +埂 +捂 +振 +载 +赶 +起 +盐 +捎 +捍 +捏 +埋 +捉 +捆 +捐 +损 +袁 +捌 +都 +哲 +逝 +捡 +挫 +换 +挽 +挚 +热 +恐 +捣 +壶 +捅 +埃 +挨 +耻 +耿 +耽 +聂 +恭 +莽 +莱 +莲 +莫 +莉 +荷 +获 +晋 +恶 +莹 +莺 +真 +框 +梆 +桂 +桔 +栖 +档 +桐 +株 +桥 +桦 +栓 +桃 +格 +桩 +校 +核 +样 
+根 +索 +哥 +速 +逗 +栗 +贾 +酌 +配 +翅 +辱 +唇 +夏 +砸 +砰 +砾 +础 +破 +原 +套 +逐 +烈 +殊 +殉 +顾 +轿 +较 +顿 +毙 +致 +柴 +桌 +虑 +监 +紧 +党 +逞 +晒 +眠 +晓 +哮 +唠 +鸭 +晃 +哺 +晌 +剔 +晕 +蚌 +畔 +蚣 +蚊 +蚪 +蚓 +哨 +哩 +圃 +哭 +哦 +恩 +鸯 +唤 +唁 +哼 +唧 +啊 +唉 +唆 +罢 +峭 +峨 +峰 +圆 +峻 +贼 +贿 +赂 +赃 +钱 +钳 +钻 +钾 +铁 +铃 +铅 +缺 +氧 +氨 +特 +牺 +造 +乘 +敌 +秤 +租 +积 +秧 +秩 +称 +秘 +透 +笔 +笑 +笋 +债 +借 +值 +倚 +俺 +倾 +倒 +倘 +俱 +倡 +候 +赁 +俯 +倍 +倦 +健 +臭 +射 +躬 +息 +倔 +徒 +徐 +殷 +舰 +舱 +般 +航 +途 +拿 +耸 +爹 +舀 +爱 +豺 +豹 +颁 +颂 +翁 +胰 +脆 +脂 +胸 +胳 +脏 +脐 +胶 +脑 +脓 +逛 +狸 +狼 +卿 +逢 +鸵 +留 +鸳 +皱 +饿 +馁 +凌 +凄 +恋 +桨 +浆 +衰 +衷 +高 +郭 +席 +准 +座 +症 +病 +疾 +斋 +疹 +疼 +疲 +脊 +效 +离 +紊 +唐 +瓷 +资 +凉 +站 +剖 +竞 +部 +旁 +旅 +畜 +阅 +羞 +羔 +瓶 +拳 +粉 +料 +益 +兼 +烤 +烘 +烦 +烧 +烛 +烟 +烙 +递 +涛 +浙 +涝 +浦 +酒 +涉 +消 +涡 +浩 +海 +涂 +浴 +浮 +涣 +涤 +流 +润 +涧 +涕 +浪 +浸 +涨 +烫 +涩 +涌 +悖 +悟 +悄 +悍 +悔 +悯 +悦 +害 +宽 +家 +宵 +宴 +宾 +窍 +窄 +容 +宰 +案 +请 +朗 +诸 +诺 +读 +扇 +诽 +袜 +袖 +袍 +被 +祥 +课 +冥 +谁 +调 +冤 +谅 +谆 +谈 +谊 +剥 +恳 +展 +剧 +屑 +弱 +陵 +祟 +陶 +陷 +陪 +娱 +娟 +恕 +娥 +娘 +通 +能 +难 +预 +桑 +绢 +绣 +验 +继 +骏 +球 +琐 +理 +琉 +琅 +捧 +堵 +措 +描 +域 +捺 +掩 +捷 +排 +焉 +掉 +捶 +赦 +堆 +推 +埠 +掀 +授 +捻 +教 +掏 +掐 +掠 +掂 +培 +接 +掷 +控 +探 +据 +掘 +掺 +职 +基 +聆 +勘 +聊 +娶 +著 +菱 +勒 +黄 +菲 +萌 +萝 +菌 +萎 +菜 +萄 +菊 +菩 +萍 +菠 +萤 +营 +乾 +萧 +萨 +菇 +械 +彬 +梦 +婪 +梗 +梧 +梢 +梅 +检 +梳 +梯 +桶 +梭 +救 +曹 +副 +票 +酝 +酗 +厢 +戚 +硅 +硕 +奢 +盔 +爽 +聋 +袭 +盛 +匾 +雪 +辅 +辆 +颅 +虚 +彪 +雀 +堂 +常 +眶 +匙 +晨 +睁 +眯 +眼 +悬 +野 +啪 +啦 +曼 +晦 +晚 +啄 +啡 +距 +趾 +啃 +跃 +略 +蚯 +蛀 +蛇 +唬 +累 +鄂 +唱 +患 +啰 +唾 +唯 +啤 +啥 +啸 +崖 +崎 +崭 +逻 +崔 +帷 +崩 +崇 +崛 +婴 +圈 +铐 +铛 +铝 +铜 +铭 +铲 +银 +矫 +甜 +秸 +梨 +犁 +秽 +移 +笨 +笼 +笛 +笙 +符 +第 +敏 +做 +袋 +悠 +偿 +偶 +偎 +偷 +您 +售 +停 +偏 +躯 +兜 +假 +衅 +徘 +徙 +得 +衔 +盘 +舶 +船 +舵 +斜 +盒 +鸽 +敛 +悉 +欲 +彩 +领 +脚 +脖 +脯 +豚 +脸 +脱 +象 +够 +逸 +猜 +猪 +猎 +猫 +凰 +猖 +猛 +祭 +馅 +馆 +凑 +减 +毫 +烹 +庶 +麻 +庵 +痊 +痒 +痕 +廊 +康 +庸 +鹿 +盗 +章 +竟 +商 +族 +旋 +望 +率 +阎 +阐 +着 +羚 +盖 +眷 +粘 +粗 +粒 +断 +剪 +兽 +焊 +焕 +清 +添 +鸿 +淋 +涯 +淹 +渠 +渐 +淑 +淌 +混 +淮 +淆 +渊 +淫 +渔 +淘 +淳 +液 +淤 +淡 +淀 +深 +涮 +涵 +婆 +梁 +渗 +情 +惜 +惭 +悼 +惧 +惕 +惟 +惊 +惦 +悴 +惋 +惨 +惯 +寇 +寅 +寄 +寂 +宿 +窒 +窑 +密 +谋 +谍 +谎 +谐 +袱 +祷 +祸 +谓 +谚 +谜 +逮 +敢 +尉 +屠 +弹 +隋 +堕 +随 +蛋 +隅 +隆 +隐 +婚 +婶 +婉 +颇 +颈 +绩 +绪 +续 +骑 +绰 +绳 +维 +绵 +绷 +绸 +综 +绽 +绿 +缀 +巢 +琴 +琳 +琢 +琼 +斑 +替 +揍 +款 +堪 +塔 +搭 +堰 +揩 +越 +趁 
+趋 +超 +揽 +堤 +提 +博 +揭 +喜 +彭 +揣 +插 +揪 +搜 +煮 +援 +搀 +裁 +搁 +搓 +搂 +搅 +壹 +握 +搔 +揉 +斯 +期 +欺 +联 +葫 +散 +惹 +葬 +募 +葛 +董 +葡 +敬 +葱 +蒋 +蒂 +落 +韩 +朝 +辜 +葵 +棒 +棱 +棋 +椰 +植 +森 +焚 +椅 +椒 +棵 +棍 +椎 +棉 +棚 +棕 +棺 +榔 +椭 +惠 +惑 +逼 +粟 +棘 +酣 +酥 +厨 +厦 +硬 +硝 +确 +硫 +雁 +殖 +裂 +雄 +颊 +雳 +暂 +雅 +翘 +辈 +悲 +紫 +凿 +辉 +敞 +棠 +赏 +掌 +晴 +睐 +暑 +最 +晰 +量 +鼎 +喷 +喳 +晶 +喇 +遇 +喊 +遏 +晾 +景 +畴 +践 +跋 +跌 +跑 +跛 +遗 +蛙 +蛛 +蜓 +蜒 +蛤 +喝 +鹃 +喂 +喘 +喉 +喻 +啼 +喧 +嵌 +幅 +帽 +赋 +赌 +赎 +赐 +赔 +黑 +铸 +铺 +链 +销 +锁 +锄 +锅 +锈 +锋 +锌 +锐 +甥 +掰 +短 +智 +氮 +毯 +氯 +鹅 +剩 +稍 +程 +稀 +税 +筐 +等 +筑 +策 +筛 +筒 +筏 +答 +筋 +筝 +傲 +傅 +牌 +堡 +集 +焦 +傍 +储 +皓 +皖 +粤 +奥 +街 +惩 +御 +循 +艇 +舒 +逾 +番 +释 +禽 +腊 +脾 +腋 +腔 +腕 +鲁 +猩 +猬 +猾 +猴 +惫 +然 +馈 +馋 +装 +蛮 +就 +敦 +斌 +痘 +痢 +痪 +痛 +童 +竣 +阔 +善 +翔 +羡 +普 +粪 +尊 +奠 +道 +遂 +曾 +焰 +港 +滞 +湖 +湘 +渣 +渤 +渺 +湿 +温 +渴 +溃 +溅 +滑 +湃 +渝 +湾 +渡 +游 +滋 +渲 +溉 +愤 +慌 +惰 +愕 +愣 +惶 +愧 +愉 +慨 +割 +寒 +富 +寓 +窜 +窝 +窖 +窗 +窘 +遍 +雇 +裕 +裤 +裙 +禅 +禄 +谢 +谣 +谤 +谦 +犀 +属 +屡 +强 +粥 +疏 +隔 +隙 +隘 +媒 +絮 +嫂 +媚 +婿 +登 +缅 +缆 +缉 +缎 +缓 +缔 +缕 +骗 +编 +骚 +缘 +瑟 +鹉 +瑞 +瑰 +瑙 +魂 +肆 +摄 +摸 +填 +搏 +塌 +鼓 +摆 +携 +搬 +摇 +搞 +塘 +摊 +聘 +斟 +蒜 +勤 +靴 +靶 +鹊 +蓝 +墓 +幕 +蓬 +蓄 +蒲 +蓉 +蒙 +蒸 +献 +椿 +禁 +楚 +楷 +榄 +想 +槐 +榆 +楼 +概 +赖 +酪 +酬 +感 +碍 +碘 +碑 +碎 +碰 +碗 +碌 +尴 +雷 +零 +雾 +雹 +辐 +辑 +输 +督 +频 +龄 +鉴 +睛 +睹 +睦 +瞄 +睫 +睡 +睬 +嗜 +鄙 +嗦 +愚 +暖 +盟 +歇 +暗 +暇 +照 +畸 +跨 +跷 +跳 +跺 +跪 +路 +跤 +跟 +遣 +蜈 +蜗 +蛾 +蜂 +蜕 +嗅 +嗡 +嗓 +署 +置 +罪 +罩 +蜀 +幌 +错 +锚 +锡 +锣 +锤 +锥 +锦 +键 +锯 +锰 +矮 +辞 +稚 +稠 +颓 +愁 +筹 +签 +简 +筷 +毁 +舅 +鼠 +催 +傻 +像 +躲 +魁 +衙 +微 +愈 +遥 +腻 +腰 +腥 +腮 +腹 +腺 +鹏 +腾 +腿 +鲍 +猿 +颖 +触 +解 +煞 +雏 +馍 +馏 +酱 +禀 +痹 +廓 +痴 +痰 +廉 +靖 +新 +韵 +意 +誊 +粮 +数 +煎 +塑 +慈 +煤 +煌 +满 +漠 +滇 +源 +滤 +滥 +滔 +溪 +溜 +漓 +滚 +溢 +溯 +滨 +溶 +溺 +粱 +滩 +慎 +誉 +塞 +寞 +窥 +窟 +寝 +谨 +褂 +裸 +福 +谬 +群 +殿 +辟 +障 +媳 +嫉 +嫌 +嫁 +叠 +缚 +缝 +缠 +缤 +剿 +静 +碧 +璃 +赘 +熬 +墙 +墟 +嘉 +摧 +赫 +截 +誓 +境 +摘 +摔 +撇 +聚 +慕 +暮 +摹 +蔓 +蔑 +蔡 +蔗 +蔽 +蔼 +熙 +蔚 +兢 +模 +槛 +榴 +榜 +榨 +榕 +歌 +遭 +酵 +酷 +酿 +酸 +碟 +碱 +碳 +磁 +愿 +需 +辖 +辗 +雌 +裳 +颗 +瞅 +墅 +嗽 +踊 +蜻 +蜡 +蝇 +蜘 +蝉 +嘛 +嘀 +赚 +锹 +锻 +镀 +舞 +舔 +稳 +熏 +箕 +算 +箩 +管 +箫 +舆 +僚 +僧 +鼻 +魄 +魅 +貌 +膜 +膊 +膀 +鲜 +疑 +孵 +馒 +裹 +敲 +豪 +膏 +遮 +腐 +瘩 +瘟 +瘦 +辣 +彰 +竭 +端 +旗 +精 +粹 +歉 +弊 +熄 +熔 +煽 +潇 +漆 +漱 +漂 +漫 +滴 +漾 +演 +漏 +慢 +慷 +寨 +赛 +寡 +察 +蜜 +寥 +谭 +肇 +褐 +褪 +谱 
+隧 +嫩 +翠 +熊 +凳 +骡 +缩 +慧 +撵 +撕 +撒 +撩 +趣 +趟 +撑 +撮 +撬 +播 +擒 +墩 +撞 +撤 +增 +撰 +聪 +鞋 +鞍 +蕉 +蕊 +蔬 +蕴 +横 +槽 +樱 +橡 +樟 +橄 +敷 +豌 +飘 +醋 +醇 +醉 +磕 +磊 +磅 +碾 +震 +霄 +霉 +瞒 +题 +暴 +瞎 +嘻 +嘶 +嘲 +嘹 +影 +踢 +踏 +踩 +踪 +蝶 +蝴 +蝠 +蝎 +蝌 +蝗 +蝙 +嘿 +嘱 +幢 +墨 +镇 +镐 +镑 +靠 +稽 +稻 +黎 +稿 +稼 +箱 +篓 +箭 +篇 +僵 +躺 +僻 +德 +艘 +膝 +膛 +鲤 +鲫 +熟 +摩 +褒 +瘪 +瘤 +瘫 +凛 +颜 +毅 +糊 +遵 +憋 +潜 +澎 +潮 +潭 +鲨 +澳 +潘 +澈 +澜 +澄 +懂 +憔 +懊 +憎 +额 +翩 +褥 +谴 +鹤 +憨 +慰 +劈 +履 +豫 +缭 +撼 +擂 +操 +擅 +燕 +蕾 +薯 +薛 +薇 +擎 +薪 +薄 +颠 +翰 +噩 +橱 +橙 +橘 +整 +融 +瓢 +醒 +霍 +霎 +辙 +冀 +餐 +嘴 +踱 +蹄 +蹂 +蟆 +螃 +器 +噪 +鹦 +赠 +默 +黔 +镜 +赞 +穆 +篮 +篡 +篷 +篱 +儒 +邀 +衡 +膨 +雕 +鲸 +磨 +瘾 +瘸 +凝 +辨 +辩 +糙 +糖 +糕 +燃 +濒 +澡 +激 +懒 +憾 +懈 +窿 +壁 +避 +缰 +缴 +戴 +擦 +藉 +鞠 +藏 +藐 +檬 +檐 +檀 +礁 +磷 +霜 +霞 +瞭 +瞧 +瞬 +瞳 +瞩 +瞪 +曙 +蹋 +蹈 +螺 +蟋 +蟀 +嚎 +赡 +穗 +魏 +簧 +簇 +繁 +徽 +爵 +朦 +臊 +鳄 +癌 +辫 +赢 +糟 +糠 +燥 +懦 +豁 +臀 +臂 +翼 +骤 +藕 +鞭 +藤 +覆 +瞻 +蹦 +嚣 +镰 +翻 +鳍 +鹰 +瀑 +襟 +璧 +戳 +孽 +警 +蘑 +藻 +攀 +曝 +蹲 +蹭 +蹬 +巅 +簸 +簿 +蟹 +颤 +靡 +癣 +瓣 +羹 +鳖 +爆 +疆 +鬓 +壤 +馨 +耀 +躁 +蠕 +嚼 +嚷 +巍 +籍 +鳞 +魔 +糯 +灌 +譬 +蠢 +霸 +露 +霹 +躏 +黯 +髓 +赣 +囊 +镶 +瓤 +罐 +矗 +乂 +乜 +兀 +弋 +孑 +孓 +幺 +亓 +韦 +廿 +丏 +卅 +仄 +厄 +仃 +仉 +仂 +兮 +刈 +爻 +卞 +闩 +讣 +尹 +夬 +爿 +毋 +邗 +邛 +艽 +艿 +札 +叵 +匝 +丕 +匜 +劢 +卟 +叱 +叻 +仨 +仕 +仟 +仡 +仫 +仞 +卮 +氐 +犰 +刍 +邝 +邙 +汀 +讦 +讧 +讪 +讫 +尻 +阡 +尕 +弁 +驭 +匡 +耒 +玎 +玑 +邢 +圩 +圬 +圭 +扦 +圪 +圳 +圹 +扪 +圮 +圯 +芊 +芍 +芄 +芨 +芑 +芎 +芗 +亘 +厍 +夼 +戍 +尥 +乩 +旯 +曳 +岌 +屺 +凼 +囡 +钇 +缶 +氘 +氖 +牝 +伎 +伛 +伢 +佤 +仵 +伥 +伧 +伉 +伫 +囟 +汆 +刖 +夙 +旮 +刎 +犷 +犸 +舛 +凫 +邬 +饧 +汕 +汔 +汐 +汲 +汜 +汊 +忖 +忏 +讴 +讵 +祁 +讷 +聿 +艮 +厾 +阱 +阮 +阪 +丞 +妁 +牟 +纡 +纣 +纥 +纨 +玕 +玙 +抟 +抔 +圻 +坂 +坍 +坞 +抃 +抉 +㧐 +芫 +邯 +芸 +芾 +苈 +苣 +芷 +芮 +苋 +芼 +苌 +苁 +芩 +芪 +芡 +芟 +苄 +苎 +苡 +杌 +杓 +杞 +杈 +忑 +孛 +邴 +邳 +矶 +奁 +豕 +忒 +欤 +轫 +迓 +邶 +忐 +卣 +邺 +旰 +呋 +呒 +呓 +呔 +呖 +呃 +旸 +吡 +町 +虬 +呗 +吽 +吣 +吲 +帏 +岐 +岈 +岘 +岑 +岚 +兕 +囵 +囫 +钊 +钋 +钌 +迕 +氙 +氚 +牤 +佞 +邱 +攸 +佚 +佝 +佟 +佗 +伽 +彷 +佘 +佥 +孚 +豸 +坌 +肟 +邸 +奂 +劬 +狄 +狁 +鸠 +邹 +饨 +饩 +饪 +饫 +饬 +亨 +庑 +庋 +疔 +疖 +肓 +闱 +闳 +闵 +羌 +炀 +沣 +沅 +沔 +沤 +沌 +沏 +沚 +汩 +汨 +沂 +汾 +沨 +汴 +汶 +沆 +沩 +泐 +怃 +怄 +忡 +忤 +忾 +怅 +忻 +忪 +怆 +忭 +忸 +诂 +诃 +诅 +诋 +诌 +诏 +诒 +孜 +陇 +陀 +陂 +陉 +妍 +妩 +妪 +妣 +妊 +妗 +妫 +妞 +姒 +妤 +邵 +劭 +刭 +甬 +邰 +纭 +纰 +纴 +纶 +纾 +玮 +玡 +玭 +玠 +玢 +玥 +玦 +盂 +忝 +匦 +坩 +抨 +拤 +坫 +拈 +垆 +抻 +劼 +拃 +拊 +坼 +坻 
+㧟 +坨 +坭 +抿 +坳 +耶 +苷 +苯 +苤 +茏 +苫 +苜 +苴 +苒 +苘 +茌 +苻 +苓 +茚 +茆 +茑 +茓 +茔 +茕 +茀 +苕 +枥 +枇 +杪 +杳 +枧 +杵 +枨 +枞 +枋 +杻 +杷 +杼 +矸 +砀 +刳 +奄 +瓯 +殁 +郏 +轭 +郅 +鸢 +盱 +昊 +昙 +杲 +昃 +咂 +呸 +昕 +昀 +旻 +昉 +炅 +咔 +畀 +虮 +咀 +呷 +黾 +呱 +呤 +咚 +咆 +咛 +呶 +呣 +呦 +咝 +岢 +岿 +岬 +岫 +帙 +岣 +峁 +刿 +迥 +岷 +剀 +帔 +峄 +沓 +囹 +罔 +钍 +钎 +钏 +钒 +钕 +钗 +邾 +迮 +牦 +竺 +迤 +佶 +佬 +佰 +侑 +侉 +臾 +岱 +侗 +侃 +侏 +侩 +佻 +佾 +侪 +佼 +佯 +侬 +帛 +阜 +侔 +徂 +刽 +郄 +怂 +籴 +瓮 +戗 +肼 +䏝 +肽 +肱 +肫 +剁 +迩 +郇 +狙 +狎 +狍 +狒 +咎 +炙 +枭 +饯 +饴 +冽 +冼 +庖 +疠 +疝 +疡 +兖 +妾 +劾 +炜 +𬉼 +炖 +炘 +炝 +炔 +泔 +沭 +泷 +泸 +泱 +泅 +泗 +泠 +泺 +泖 +泫 +泮 +沱 +泯 +泓 +泾 +怙 +怵 +怦 +怛 +怏 +怍 +㤘 +怩 +怫 +怿 +宕 +穹 +宓 +诓 +诔 +诖 +诘 +戾 +诙 +戽 +郓 +衩 +祆 +祎 +祉 +祇 +诛 +诜 +诟 +诠 +诣 +诤 +诧 +诨 +诩 +戕 +孢 +亟 +陔 +妲 +妯 +姗 +帑 +弩 +孥 +驽 +虱 +迦 +迨 +绀 +绁 +绂 +驷 +驸 +绉 +绌 +驿 +骀 +甾 +珏 +珐 +珂 +珑 +玳 +珀 +顸 +珉 +珈 +拮 +垭 +挝 +垣 +挞 +垤 +赳 +贲 +垱 +垌 +郝 +垧 +垓 +挦 +垠 +茜 +荚 +荑 +贳 +荜 +莒 +茼 +茴 +茱 +莛 +荞 +茯 +荏 +荇 +荃 +荟 +荀 +茗 +荠 +茭 +茨 +垩 +荥 +荦 +荨 +荩 +剋 +荪 +茹 +荬 +荮 +柰 +栉 +柯 +柘 +栊 +柩 +枰 +栌 +柙 +枵 +柚 +枳 +柞 +柝 +栀 +柢 +栎 +枸 +柈 +柁 +枷 +柽 +剌 +酊 +郦 +甭 +砗 +砘 +砒 +斫 +砭 +砜 +奎 +耷 +虺 +殂 +殇 +殄 +殆 +轱 +轲 +轳 +轶 +轸 +虿 +毖 +觇 +尜 +哐 +眄 +眍 +𠳐 +郢 +眇 +眊 +眈 +禺 +哂 +咴 +曷 +昴 +昱 +昵 +咦 +哓 +哔 +畎 +毗 +呲 +胄 +畋 +畈 +虼 +虻 +盅 +咣 +哕 +剐 +郧 +咻 +囿 +咿 +哌 +哙 +哚 +咯 +咩 +咤 +哝 +哏 +哞 +峙 +峣 +罘 +帧 +峒 +峤 +峋 +峥 +贶 +钚 +钛 +钡 +钣 +钤 +钨 +钫 +钯 +氡 +氟 +牯 +郜 +秕 +秭 +竽 +笈 +笃 +俦 +俨 +俅 +俪 +叟 +垡 +牮 +俣 +俚 +皈 +俑 +俟 +逅 +徇 +徉 +舢 +俞 +郗 +俎 +郤 +爰 +郛 +瓴 +胨 +胪 +胛 +胂 +胙 +胍 +胗 +胝 +朐 +胫 +鸨 +匍 +狨 +狯 +飑 +狩 +狲 +訇 +逄 +昝 +饷 +饸 +饹 +胤 +孪 +娈 +弈 +奕 +庥 +疬 +疣 +疥 +疭 +庠 +竑 +彦 +飒 +闼 +闾 +闿 +阂 +羑 +迸 +籼 +酋 +炳 +炻 +炽 +炯 +烀 +炷 +烃 +洱 +洹 +洧 +洌 +浃 +洇 +洄 +洙 +涎 +洎 +洫 +浍 +洮 +洵 +浒 +浔 +浕 +洳 +恸 +恓 +恹 +恫 +恺 +恻 +恂 +恪 +恽 +宥 +扃 +衲 +衽 +衿 +袂 +祛 +祜 +祓 +祚 +诮 +祗 +祢 +诰 +诳 +鸩 +昶 +郡 +咫 +弭 +牁 +胥 +陛 +陟 +娅 +姮 +娆 +姝 +姣 +姘 +姹 +怼 +羿 +炱 +矜 +绔 +骁 +骅 +绗 +绛 +骈 +耖 +挈 +珥 +珙 +顼 +珰 +珩 +珧 +珣 +珞 +琤 +珲 +敖 +恚 +埔 +埕 +埘 +埙 +埚 +挹 +耆 +耄 +埒 +捋 +贽 +垸 +捃 +盍 +荸 +莆 +莳 +莴 +莪 +莠 +莓 +莜 +莅 +荼 +莩 +荽 +莸 +荻 +莘 +莎 +莞 +莨 +渇 +鸪 +莼 +栲 +栳 +郴 +桓 +桡 +桎 +桢 +桤 +梃 +栝 +桕 +桁 +桧 +桅 +栟 +桉 +栩 +逑 +逋 +彧 +鬲 +豇 +酐 +逦 +厝 +孬 +砝 +砹 +砺 +砧 +砷 +砟 +砼 +砥 +砣 +剞 +砻 +轼 +轾 +辂 +鸫 +趸 +龀 +鸬 +虔 +逍 +眬 +唛 +晟 +眩 +眙 +哧 +哽 +唔 +晁 +晏 +鸮 +趵 +趿 +畛 +蚨 +蚜 +蚍 +蚋 +蚬 +蚝 
+蚧 +唢 +圄 +唣 +唏 +盎 +唑 +崂 +崃 +罡 +罟 +峪 +觊 +赅 +钰 +钲 +钴 +钵 +钹 +钺 +钽 +钼 +钿 +铀 +铂 +铄 +铆 +铈 +铉 +铊 +铋 +铌 +铍 +䥽 +铎 +氩 +氤 +氦 +毪 +舐 +秣 +秫 +盉 +笄 +笕 +笊 +笏 +笆 +俸 +倩 +俵 +偌 +俳 +俶 +倬 +倏 +恁 +倭 +倪 +俾 +倜 +隼 +隽 +倌 +倥 +臬 +皋 +郫 +倨 +衄 +颀 +徕 +舫 +釜 +奚 +衾 +胯 +胱 +胴 +胭 +脍 +胼 +朕 +脒 +胺 +鸱 +玺 +鸲 +狷 +猁 +狳 +猃 +狺 +逖 +桀 +袅 +饽 +凇 +栾 +挛 +亳 +疳 +疴 +疸 +疽 +痈 +疱 +痂 +痉 +衮 +凋 +颃 +恣 +旆 +旄 +旃 +阃 +阄 +訚 +阆 +恙 +粑 +朔 +郸 +烜 +烨 +烩 +烊 +剡 +郯 +烬 +涑 +浯 +涞 +涟 +娑 +涅 +涠 +浞 +涓 +浥 +涔 +浜 +浠 +浣 +浚 +悚 +悭 +悝 +悒 +悌 +悛 +宸 +窈 +剜 +诹 +冢 +诼 +袒 +袢 +祯 +诿 +谀 +谂 +谄 +谇 +屐 +屙 +陬 +勐 +奘 +牂 +蚩 +陲 +姬 +娠 +娌 +娉 +娲 +娩 +娴 +娣 +娓 +婀 +畚 +逡 +绠 +骊 +绡 +骋 +绥 +绦 +绨 +骎 +邕 +鸶 +彗 +耜 +焘 +舂 +琏 +琇 +麸 +揶 +埴 +埯 +捯 +掳 +掴 +埸 +埵 +赧 +埤 +捭 +逵 +埝 +堋 +堍 +掬 +鸷 +掖 +捽 +掊 +堉 +掸 +捩 +掮 +悫 +埭 +埽 +掇 +掼 +聃 +菁 +萁 +菘 +堇 +萘 +萋 +菽 +菖 +萜 +萸 +萑 +棻 +菔 +菟 +萏 +萃 +菏 +菹 +菪 +菅 +菀 +萦 +菰 +菡 +梵 +梿 +梏 +觋 +桴 +桷 +梓 +棁 +桫 +棂 +啬 +郾 +匮 +敕 +豉 +鄄 +酞 +酚 +戛 +硎 +硭 +硒 +硖 +硗 +硐 +硇 +硌 +鸸 +瓠 +匏 +厩 +龚 +殒 +殓 +殍 +赉 +雩 +辄 +堑 +眭 +眦 +啧 +晡 +晤 +眺 +眵 +眸 +圊 +喏 +喵 +啉 +勖 +晞 +唵 +晗 +冕 +啭 +畦 +趺 +啮 +跄 +蚶 +蛄 +蛎 +蛆 +蚰 +蛊 +圉 +蚱 +蛉 +蛏 +蚴 +啁 +啕 +唿 +啐 +唼 +唷 +啖 +啵 +啶 +啷 +唳 +唰 +啜 +帻 +崚 +崦 +帼 +崮 +崤 +崆 +赇 +赈 +赊 +铑 +铒 +铗 +铙 +铟 +铠 +铡 +铢 +铣 +铤 +铧 +铨 +铩 +铪 +铫 +铬 +铮 +铯 +铰 +铱 +铳 +铵 +铷 +氪 +牾 +鸹 +秾 +逶 +笺 +筇 +笸 +笪 +笮 +笠 +笥 +笤 +笳 +笾 +笞 +偾 +偃 +偕 +偈 +傀 +偬 +偻 +皑 +皎 +鸻 +徜 +舸 +舻 +舴 +舷 +龛 +翎 +脬 +脘 +脲 +匐 +猗 +猡 +猞 +猝 +斛 +猕 +馗 +馃 +馄 +鸾 +孰 +庹 +庾 +痔 +痍 +疵 +翊 +旌 +旎 +袤 +阇 +阈 +阉 +阊 +阋 +阍 +阏 +羟 +粝 +粕 +敝 +焐 +烯 +焓 +烽 +焖 +烷 +焗 +渍 +渚 +淇 +淅 +淞 +渎 +涿 +淖 +挲 +淠 +涸 +渑 +淦 +淝 +淬 +涪 +淙 +涫 +渌 +淄 +惬 +悻 +悱 +惝 +惘 +悸 +惆 +惚 +惇 +惮 +窕 +谌 +谏 +扈 +皲 +谑 +裆 +袷 +裉 +谒 +谔 +谕 +谖 +谗 +谙 +谛 +谝 +逯 +郿 +隈 +粜 +隍 +隗 +婧 +婊 +婕 +娼 +婢 +婵 +胬 +袈 +翌 +恿 +欸 +绫 +骐 +绮 +绯 +绱 +骒 +绲 +骓 +绶 +绺 +绻 +绾 +骖 +缁 +耠 +琫 +琵 +琶 +琪 +瑛 +琦 +琥 +琨 +靓 +琰 +琮 +琯 +琬 +琛 +琚 +辇 +鼋 +揳 +堞 +搽 +揸 +揠 +堙 +趄 +揖 +颉 +塄 +揿 +耋 +揄 +蛩 +蛰 +塆 +摒 +揆 +掾 +聒 +葑 +葚 +靰 +靸 +葳 +葺 +葸 +萼 +葆 +葩 +葶 +蒌 +萱 +戟 +葭 +楮 +棼 +椟 +棹 +椤 +棰 +赍 +椋 +椁 +椪 +棣 +椐 +鹁 +覃 +酤 +酢 +酡 +鹂 +厥 +殚 +殛 +雯 +雱 +辊 +辋 +椠 +辍 +辎 +斐 +睄 +睑 +睇 +睃 +戢 +喋 +嗒 +喃 +喱 +喹 +晷 +喈 +跖 +跗 +跞 +跚 +跎 +跏 +跆 +蛱 +蛲 +蛭 +蛳 +蛐 +蛔 +蛞 +蛴 +蛟 +蛘 +喁 +喟 +啾 +嗖 +喑 +嗟 +喽 +嗞 +喀 +喔 +喙 +嵘 +嵖 +崴 +遄 +詈 +嵎 +崽 +嵬 +嵛 +嵯 +嵝 +嵫 +幄 +嵋 +赕 +铻 
+铼 +铿 +锃 +锂 +锆 +锇 +锉 +锏 +锑 +锒 +锔 +锕 +掣 +矬 +氰 +毳 +毽 +犊 +犄 +犋 +鹄 +犍 +嵇 +黍 +稃 +稂 +筚 +筵 +筌 +傣 +傈 +舄 +牍 +傥 +傧 +遑 +傩 +遁 +徨 +媭 +畲 +弑 +颌 +翕 +釉 +鹆 +舜 +貂 +腈 +腌 +腓 +腆 +腴 +腑 +腚 +腱 +鱿 +鲀 +鲂 +颍 +猢 +猹 +猥 +飓 +觞 +觚 +猱 +颎 +飧 +馇 +馊 +亵 +脔 +裒 +痣 +痨 +痦 +痞 +痤 +痫 +痧 +赓 +竦 +瓿 +啻 +颏 +鹇 +阑 +阒 +阕 +粞 +遒 +孳 +焯 +焜 +焙 +焱 +鹈 +湛 +渫 +湮 +湎 +湜 +渭 +湍 +湫 +溲 +湟 +溆 +湲 +湔 +湉 +渥 +湄 +滁 +愠 +惺 +愦 +惴 +愀 +愎 +愔 +喾 +寐 +谟 +扉 +裢 +裎 +裥 +祾 +祺 +谠 +幂 +谡 +谥 +谧 +遐 +孱 +弼 +巽 +骘 +媪 +媛 +婷 +巯 +翚 +皴 +婺 +骛 +缂 +缃 +缄 +彘 +缇 +缈 +缌 +缑 +缒 +缗 +飨 +耢 +瑚 +瑁 +瑜 +瑗 +瑄 +瑕 +遨 +骜 +韫 +髡 +塬 +鄢 +趔 +趑 +摅 +摁 +蜇 +搋 +搪 +搐 +搛 +搠 +摈 +彀 +毂 +搦 +搡 +蓁 +戡 +蓍 +鄞 +靳 +蓐 +蓦 +鹋 +蒽 +蓓 +蓖 +蓊 +蒯 +蓟 +蓑 +蒿 +蒺 +蓠 +蒟 +蒡 +蒹 +蒴 +蒗 +蓥 +颐 +楔 +楠 +楂 +楝 +楫 +楸 +椴 +槌 +楯 +皙 +榈 +槎 +榉 +楦 +楣 +楹 +椽 +裘 +剽 +甄 +酮 +酰 +酯 +酩 +蜃 +碛 +碓 +硼 +碉 +碚 +碇 +碜 +鹌 +辏 +龃 +龅 +訾 +粲 +虞 +睚 +嗪 +韪 +嗷 +嗉 +睨 +睢 +雎 +睥 +嘟 +嗑 +嗫 +嗬 +嗔 +嗝 +戥 +嗄 +煦 +暄 +遢 +暌 +跬 +跶 +跸 +跐 +跣 +跹 +跻 +蛸 +蜊 +蜍 +蜉 +蜣 +畹 +蛹 +嗣 +嗯 +嗥 +嗲 +嗳 +嗌 +嗍 +嗨 +嗐 +嗤 +嗵 +罨 +嵊 +嵩 +嵴 +骰 +锗 +锛 +锜 +锝 +锞 +锟 +锢 +锨 +锩 +锭 +锱 +雉 +氲 +犏 +歃 +稞 +稗 +稔 +筠 +筢 +筮 +筲 +筱 +牒 +煲 +敫 +徭 +愆 +艄 +觎 +毹 +貊 +貅 +貉 +颔 +腠 +腩 +腼 +腭 +腧 +塍 +媵 +詹 +鲅 +鲆 +鲇 +鲈 +稣 +鲋 +鲐 +肄 +鹐 +飕 +觥 +遛 +馐 +鹑 +亶 +瘃 +痱 +痼 +痿 +瘐 +瘁 +瘆 +麂 +裔 +歆 +旒 +雍 +阖 +阗 +阙 +羧 +豢 +粳 +猷 +煳 +煜 +煨 +煅 +煊 +煸 +煺 +滟 +溱 +溘 +漭 +滢 +溥 +溧 +溽 +裟 +溻 +溷 +滗 +滫 +溴 +滏 +滃 +滦 +溏 +滂 +滓 +溟 +滪 +愫 +慑 +慊 +鲎 +骞 +窦 +窠 +窣 +裱 +褚 +裨 +裾 +裰 +禊 +谩 +谪 +媾 +嫫 +媲 +嫒 +嫔 +媸 +缙 +缜 +缛 +辔 +骝 +缟 +缡 +缢 +缣 +骟 +耥 +璈 +瑶 +瑭 +獒 +觏 +慝 +嫠 +韬 +叆 +髦 +摽 +墁 +撂 +摞 +撄 +翥 +踅 +摭 +墉 +墒 +榖 +綦 +蔫 +蔷 +靺 +靼 +鞅 +靿 +甍 +蔸 +蔟 +蔺 +戬 +蕖 +蔻 +蓿 +斡 +鹕 +蓼 +榛 +榧 +榻 +榫 +榭 +槔 +榱 +槁 +槟 +槠 +榷 +僰 +酽 +酶 +酹 +厮 +碡 +碴 +碣 +碲 +磋 +臧 +豨 +殡 +霆 +霁 +辕 +蜚 +裴 +翡 +龇 +龈 +睿 +䁖 +睽 +嘞 +嘈 +嘌 +嘁 +嘎 +暧 +暝 +踌 +踉 +蜞 +蜥 +蜮 +蝈 +蜴 +蜱 +蜩 +蜷 +蜿 +螂 +蜢 +嘘 +嘡 +鹗 +嘣 +嘤 +嘚 +嗾 +嘧 +罴 +罱 +幔 +嶂 +幛 +赙 +罂 +骷 +骶 +鹘 +锲 +锴 +锶 +锷 +锸 +锵 +镁 +镂 +犒 +箐 +箦 +箧 +箍 +箸 +箬 +箅 +箪 +箔 +箜 +箢 +箓 +毓 +僖 +儆 +僳 +僭 +劁 +僮 +魃 +魆 +睾 +艋 +鄱 +膈 +膑 +鲑 +鲔 +鲚 +鲛 +鲟 +獐 +觫 +雒 +夤 +馑 +銮 +塾 +麽 +瘌 +瘊 +瘘 +瘙 +廖 +韶 +旖 +膂 +阚 +鄯 +鲞 +粿 +粼 +粽 +糁 +槊 +鹚 +熘 +熥 +潢 +漕 +滹 +漯 +漶 +潋 +潴 +漪 +漉 +漳 +漩 +澉 +潍 +慵 +搴 +窨 +寤 +綮 +谮 +褡 +褙 +褓 +褛 +褊 +谯 +谰 +谲 +暨 +屣 +鹛 +嫣 +嫱 +嫖 +嫦 +嫚 +嫘 +嫡 +鼐 +翟 +瞀 +鹜 +骠 +缥 +缦 +缧 
+缨 +骢 +缪 +缫 +耦 +耧 +瑾 +璜 +璀 +璎 +璁 +璋 +璇 +奭 +髯 +髫 +撷 +撅 +赭 +撸 +鋆 +撙 +撺 +墀 +聩 +觐 +鞑 +蕙 +鞒 +蕈 +蕨 +蕤 +蕞 +蕺 +瞢 +蕃 +蕲 +赜 +槿 +樯 +槭 +樗 +樘 +樊 +槲 +醌 +醅 +靥 +魇 +餍 +磔 +磙 +霈 +辘 +龉 +龊 +觑 +瞌 +瞋 +瞑 +嘭 +噎 +噶 +颙 +暹 +噘 +踔 +踝 +踟 +踒 +踬 +踮 +踯 +踺 +踞 +蝽 +蝾 +蝻 +蝰 +蝮 +螋 +蝓 +蝣 +蝼 +噗 +嘬 +颚 +噍 +噢 +噙 +噜 +噌 +噔 +颛 +幞 +幡 +嶙 +嶝 +骺 +骼 +骸 +镊 +镉 +镌 +镍 +镏 +镒 +镓 +镔 +稷 +箴 +篑 +篁 +篌 +篆 +牖 +儋 +徵 +磐 +虢 +鹞 +膘 +滕 +鲠 +鲡 +鲢 +鲣 +鲥 +鲧 +鲩 +獗 +獠 +觯 +馓 +馔 +麾 +廛 +瘛 +瘼 +瘢 +瘠 +齑 +羯 +羰 +𥻗 +遴 +糌 +糍 +糅 +熜 +熵 +熠 +澍 +澌 +潸 +潦 +潲 +鋈 +潟 +潼 +潺 +憬 +憧 +寮 +窳 +谳 +褴 +褟 +褫 +谵 +熨 +屦 +嬉 +勰 +戮 +蝥 +缬 +缮 +缯 +骣 +畿 +耩 +耨 +耪 +璞 +璟 +靛 +璠 +璘 +聱 +螯 +髻 +髭 +髹 +擀 +熹 +甏 +擞 +縠 +磬 +颞 +蕻 +鞘 +颟 +薤 +薨 +檠 +薏 +薮 +薜 +薅 +樾 +橛 +橇 +樵 +檎 +橹 +樽 +樨 +橼 +墼 +橐 +翮 +醛 +醐 +醍 +醚 +磲 +赝 +飙 +殪 +霖 +霏 +霓 +錾 +辚 +臻 +遽 +氅 +瞟 +瞠 +瞰 +嚄 +嚆 +噤 +暾 +蹀 +踹 +踵 +踽 +蹉 +蹁 +螨 +蟒 +螈 +螅 +螭 +螠 +螟 +噱 +噬 +噫 +噻 +噼 +罹 +圜 +䦃 +镖 +镗 +镘 +镚 +镛 +镝 +镞 +镠 +氇 +氆 +憩 +穑 +篝 +篥 +篦 +篪 +篙 +盥 +劓 +翱 +魉 +魈 +徼 +歙 +膳 +膦 +膙 +鲮 +鲱 +鲲 +鲳 +鲴 +鲵 +鲷 +鲻 +獴 +獭 +獬 +邂 +鹧 +廨 +赟 +瘰 +廪 +瘿 +瘵 +瘴 +癃 +瘳 +斓 +麇 +麈 +嬴 +壅 +羲 +糗 +瞥 +甑 +燎 +燠 +燔 +燧 +濑 +濉 +潞 +澧 +澹 +澥 +澶 +濂 +褰 +寰 +窸 +褶 +禧 +嬖 +犟 +隰 +嬗 +颡 +缱 +缲 +缳 +璨 +璩 +璐 +璪 +螫 +擤 +壕 +觳 +罄 +擢 +薹 +鞡 +鞬 +薷 +薰 +藓 +藁 +檄 +檩 +懋 +醢 +翳 +礅 +磴 +鹩 +龋 +龌 +豳 +壑 +黻 +嚏 +嚅 +蹑 +蹒 +蹊 +蟥 +螬 +螵 +疃 +螳 +蟑 +嚓 +羁 +罽 +罾 +嶷 +黜 +黝 +髁 +髀 +镡 +镢 +镣 +镦 +镧 +镩 +镪 +镫 +罅 +黏 +簌 +篾 +篼 +簖 +簋 +鼢 +黛 +儡 +鹪 +鼾 +皤 +魍 +龠 +繇 +貘 +邈 +貔 +臌 +膻 +臆 +臃 +鲼 +鲽 +鳀 +鳃 +鳅 +鳇 +鳊 +螽 +燮 +鹫 +襄 +糜 +縻 +膺 +癍 +麋 +懑 +濡 +濮 +濞 +濠 +濯 +蹇 +謇 +邃 +襁 +檗 +擘 +孺 +隳 +嬷 +蟊 +鹬 +鍪 +鏊 +鳌 +鬈 +鬃 +瞽 +鞯 +鞨 +鞫 +鞧 +鞣 +藜 +藠 +藩 +醪 +蹙 +礓 +燹 +餮 +瞿 +曛 +颢 +曜 +躇 +蹚 +鹭 +蟛 +蟪 +蟠 +蟮 +鹮 +黠 +黟 +髅 +髂 +镬 +镭 +镯 +馥 +簟 +簪 +鼬 +雠 +艟 +鳎 +鳏 +鳐 +癞 +癔 +癜 +癖 +糨 +蹩 +鎏 +懵 +彝 +邋 +鬏 +攉 +攒 +鞲 +鞴 +藿 +蘧 +蘅 +麓 +醮 +醯 +酃 +霪 +霭 +霨 +黼 +嚯 +蹰 +蹶 +蹽 +蹼 +蹴 +蹾 +蹿 +蠖 +蠓 +蟾 +蠊 +黢 +髋 +髌 +镲 +籀 +籁 +齁 +魑 +艨 +鳓 +鳔 +鳕 +鳗 +鳙 +麒 +鏖 +羸 +㸆 +瀚 +瀣 +瀛 +襦 +谶 +襞 +骥 +缵 +瓒 +攘 +蘩 +蘖 +醴 +霰 +酆 +矍 +曦 +躅 +鼍 +巉 +黩 +黥 +黪 +镳 +镴 +黧 +纂 +璺 +鼯 +臜 +鳜 +鳝 +鳟 +獾 +孀 +骧 +瓘 +鼙 +醺 +礴 +颦 +曩 +鳢 +癫 +麝 +夔 +爝 +灏 +禳 +鐾 +羼 +蠡 +耱 +懿 +蘸 +鹳 +霾 +氍 +饕 +躐 +髑 +镵 +穰 +饔 +鬻 +鬟 +趱 +攫 +攥 +颧 +躜 +鼹 +癯 +麟 +蠲 +蠹 +躞 +衢 +鑫 +灞 +襻 +纛 +鬣 +攮 +囔 +馕 +戆 +爨 +齉 +亍 +尢 +彳 +卬 +殳 +𠙶 +毌 +邘 +戋 +圢 
+氕 +伋 +仝 +冮 +氿 +汈 +氾 +忉 +宄 +讱 +扞 +圲 +圫 +芏 +芃 +朳 +朸 +𨙸 +邨 +吒 +吖 +屼 +屾 +辿 +钆 +仳 +伣 +伈 +癿 +甪 +邠 +犴 +冱 +邡 +闫 +汋 +䜣 +讻 +孖 +纩 +玒 +玓 +玘 +玚 +刬 +坜 +坉 +扽 +坋 +扺 +㧑 +毐 +芰 +芣 +苊 +苉 +芘 +芴 +芠 +芤 +杕 +杙 +杄 +杧 +杩 +尪 +尨 +轪 +坒 +芈 +旴 +旵 +呙 +㕮 +岍 +岠 +岜 +呇 +冏 +觃 +岙 +伾 +㑇 +伭 +佖 +伲 +佁 +飏 +狃 +闶 +汧 +汫 +𣲘 +𣲗 +沄 +沘 +汭 +㳇 +沇 +忮 +忳 +忺 +祃 +诇 +邲 +诎 +诐 +屃 +岊 +阽 +䢺 +阼 +妧 +妘 +𨚕 +纮 +驲 +纻 +纼 +玤 +玞 +玱 +玟 +邽 +邿 +坥 +坰 +坬 +坽 +弆 +耵 +䢼 +𦭜 +茋 +苧 +苾 +苠 +枅 +㭎 +枘 +枍 +矼 +矻 +匼 +旿 +昇 +昄 +昒 +昈 +咉 +咇 +咍 +岵 +岽 +岨 +岞 +峂 +㟃 +囷 +钐 +钔 +钖 +牥 +佴 +垈 +侁 +侹 +佸 +佺 +隹 +㑊 +侂 +佽 +侘 +郈 +舠 +郐 +郃 +攽 +肭 +肸 +肷 +狉 +狝 +饳 +忞 +於 +炌 +炆 +泙 +沺 +泂 +泜 +泃 +泇 +怊 +峃 +穸 +祋 +祊 +鸤 +弢 +弨 +陑 +陎 +卺 +乸 +妭 +姈 +迳 +叕 +驵 +䌹 +驺 +绋 +绐 +砉 +耔 +㛃 +玶 +珇 +珅 +珋 +玹 +珌 +玿 +韨 +垚 +垯 +垙 +垲 +埏 +垍 +耇 +垎 +垴 +垟 +垞 +挓 +垵 +垏 +拶 +荖 +荁 +荙 +荛 +茈 +茽 +荄 +茺 +荓 +茳 +𦰡 +茛 +荭 +㭕 +柷 +柃 +柊 +枹 +栐 +柖 +郚 +剅 +䴓 +迺 +厖 +砆 +砑 +砄 +耏 +奓 +䶮 +轵 +轷 +轹 +轺 +昺 +昽 +盷 +咡 +咺 +昳 +昣 +哒 +昤 +昫 +昡 +咥 +昪 +虷 +虸 +哃 +峘 +耑 +峛 +峗 +峧 +帡 +钘 +钜 +钪 +钬 +钭 +矧 +秬 +俫 +舁 +俜 +俙 +俍 +垕 +衎 +舣 +弇 +侴 +鸧 +䏡 +胠 +𦙶 +胈 +胩 +胣 +朏 +飐 +訄 +饻 +庤 +疢 +炣 +炟 +㶲 +洭 +洘 +洓 +洿 +㳚 +泚 +浈 +浉 +洸 +洑 +洢 +洈 +洚 +洺 +洨 +浐 +㳘 +洴 +洣 +恔 +宬 +窀 +扂 +袆 +祏 +祐 +祕 +叚 +陧 +陞 +娀 +姞 +姱 +姤 +姶 +姽 +枲 +绖 +骃 +彖 +骉 +恝 +珪 +珛 +珹 +琊 +玼 +珖 +珽 +珦 +珫 +珒 +珢 +珕 +珝 +埗 +垾 +垺 +埆 +垿 +埌 +埇 +莰 +茝 +鄀 +莶 +莝 +䓖 +莙 +栻 +桠 +桄 +梠 +栴 +梴 +栒 +酎 +酏 +砵 +砠 +砫 +砬 +硁 +恧 +翃 +郪 +𨐈 +辀 +辁 +剕 +赀 +哢 +晅 +晊 +唝 +哳 +哱 +冔 +晔 +晐 +晖 +畖 +蚄 +蚆 +帱 +崁 +峿 +崄 +帨 +崀 +赆 +钷 +眚 +甡 +笫 +倻 +倴 +脩 +倮 +倕 +倞 +倓 +倧 +衃 +虒 +舭 +舯 +舥 +瓞 +鬯 +鸰 +脎 +朓 +胲 +虓 +鱽 +狴 +峱 +狻 +眢 +勍 +痄 +疰 +痃 +竘 +羖 +羓 +桊 +敉 +烠 +烔 +烶 +烻 +涍 +浡 +浭 +浬 +涄 +涢 +涐 +浰 +浟 +浛 +浼 +浲 +涘 +悈 +悃 +悢 +宧 +窅 +窊 +窎 +扅 +扆 +袪 +袗 +袯 +祧 +隺 +堲 +疍 +𨺙 +陴 +烝 +砮 +㛚 +哿 +翀 +翂 +剟 +绤 +骍 +䂮 +琎 +珸 +珵 +琄 +琈 +琀 +珺 +掭 +堎 +堐 +埼 +掎 +埫 +堌 +晢 +掞 +埪 +壸 +㙍 +聍 +菝 +萚 +菥 +莿 +䓫 +勚 +䓬 +萆 +菂 +菍 +菼 +萣 +䓨 +菉 +䓛 +梼 +梽 +桲 +梾 +桯 +梣 +梌 +桹 +敔 +厣 +硔 +硙 +硚 +硊 +硍 +勔 +䴕 +龁 +逴 +唪 +啫 +翈 +㫰 +晙 +畤 +趼 +跂 +蛃 +蚲 +蚺 +啴 +䎃 +崧 +崟 +崞 +崒 +崌 +崡 +铏 +铕 +铖 +铘 +铚 +铞 +铥 +铴 +牻 +牿 +稆 +笱 +笯 +偰 +偡 +鸺 +偭 +偲 +偁 +㿠 +鄅 +偓 +徛 +衒 +舳 +舲 +鸼 +悆 +鄃 +瓻 +䝙 +脶 +脞 +脟 +䏲 +鱾 +猇 +猊 +猄 +觖 +𠅤 +庱 +庼 +庳 +痓 +䴔 +竫 +堃 +阌 +羝 +羕 +焆 +烺 +焌 +淏 +淟 +淜 +淴 +淯 +湴 +涴 +㥄 +惛 +惔 +悰 +惙 +寁 
+逭 +袼 +裈 +祲 +谞 +艴 +弸 +弶 +隃 +婞 +娵 +婼 +媖 +婳 +婍 +婌 +婫 +婤 +婘 +婠 +绹 +骕 +絜 +珷 +琲 +琡 +琟 +琔 +琭 +堾 +堼 +揕 +㙘 +堧 +喆 +堨 +塅 +堠 +絷 +𡎚 +葜 +惎 +萳 +葙 +靬 +葴 +蒇 +蒈 +鄚 +蒉 +蓇 +萩 +蒐 +葰 +葎 +鄑 +蒎 +葖 +蒄 +萹 +棤 +棽 +棫 +椓 +椑 +鹀 +椆 +棓 +棬 +棪 +椀 +楗 +甦 +酦 +觌 +奡 +皕 +硪 +欹 +詟 +辌 +棐 +龂 +黹 +牚 +睎 +晫 +晪 +晱 +𧿹 +蛑 +畯 +斝 +喤 +崶 +嵁 +崾 +嵅 +崿 +嵚 +翙 +圌 +圐 +赑 +淼 +赒 +铹 +铽 +𨱇 +锊 +锍 +锎 +锓 +犇 +颋 +稌 +筀 +筘 +筜 +筥 +筅 +傃 +傉 +翛 +傒 +傕 +舾 +畬 +脿 +腘 +䐃 +腙 +腒 +鲃 +猰 +猯 +㺄 +馉 +鄗 +廋 +廆 +鄌 +粢 +遆 +旐 +焞 +欻 +𣸣 +溚 +溁 +湝 +渰 +湓 +㴔 +渟 +溠 +渼 +溇 +湣 +湑 +溞 +愐 +愃 +敩 +甯 +棨 +扊 +裣 +祼 +婻 +媆 +媞 +㛹 +媓 +媂 +媄 +毵 +矞 +缊 +缐 +骙 +瑃 +瑓 +瑅 +瑆 +䴖 +瑖 +瑝 +瑔 +瑀 +𤧛 +瑳 +瑂 +嶅 +瑑 +遘 +髢 +塥 +堽 +赪 +摛 +塝 +搒 +搌 +蒱 +蒨 +蓏 +蔀 +蓢 +蓂 +蒻 +蓣 +椹 +楪 +榃 +榅 +楒 +楞 +楩 +榇 +椸 +楙 +歅 +碃 +碏 +碈 +䃅 +硿 +鄠 +辒 +龆 +觜 +䣘 +暕 +鹍 +㬊 +暅 +跱 +蜐 +蜎 +嵲 +赗 +骱 +锖 +锘 +锳 +锧 +锪 +锫 +锬 +稑 +稙 +䅟 +筻 +筼 +筶 +筦 +筤 +傺 +鹎 +僇 +艅 +艉 +谼 +貆 +腽 +腨 +腯 +鲉 +鲊 +鲌 +䲟 +鲏 +雊 +猺 +飔 +觟 +𦝼 +馌 +裛 +廒 +瘀 +瘅 +鄘 +鹒 +鄜 +麀 +鄣 +阘 +煁 +煃 +煴 +煋 +煟 +煓 +滠 +溍 +溹 +滆 +滉 +溦 +溵 +漷 +滧 +滘 +滍 +愭 +慥 +慆 +塱 +裼 +禋 +禔 +禘 +禒 +谫 +鹔 +愍 +嫄 +媱 +戤 +戣 +缞 +耤 +瑧 +瑨 +瑱 +瑷 +瑢 +斠 +摏 +墕 +墈 +墐 +墘 +摴 +銎 +𡐓 +墚 +撖 +靽 +鞁 +蔌 +蔈 +蓰 +蔹 +蔊 +嘏 +榰 +榑 +槚 +𣗋 +槜 +榍 +疐 +酺 +酾 +酲 +酴 +碶 +䃎 +碨 +𥔲 +碹 +碥 +劂 +䴗 +夥 +瞍 +鹖 +㬎 +跽 +蜾 +幖 +嶍 +圙 +𨱏 +锺 +锼 +锽 +锾 +锿 +镃 +镄 +镅 +馝 +鹙 +箨 +箖 +劄 +僬 +僦 +僔 +僎 +槃 +㙦 +鲒 +鲕 +鲖 +鲗 +鲘 +鲙 +𩽾 +夐 +獍 +飗 +凘 +廑 +廙 +瘗 +瘥 +瘕 +鲝 +鄫 +熇 +漹 +漖 +潆 +漤 +潩 +漼 +漴 +㽏 +漈 +漋 +漻 +慬 +窬 +窭 +㮾 +褕 +禛 +禚 +隩 +嫕 +嫭 +嫜 +嫪 +㻬 +麹 +璆 +漦 +叇 +墣 +墦 +墡 +劐 +薁 +蕰 +蔃 +鼒 +槱 +鹝 +磏 +磉 +殣 +慭 +霅 +暵 +暲 +暶 +踦 +踣 +䗖 +蝘 +蝲 +蝤 +噇 +噂 +噀 +罶 +嶲 +嶓 +㠇 +嶟 +嶒 +镆 +镈 +镋 +镎 +镕 +稹 +儇 +皞 +皛 +䴘 +艎 +艏 +鹟 +𩾃 +鲦 +鲪 +鲬 +橥 +觭 +鹠 +鹡 +糇 +糈 +翦 +鹢 +鹣 +熛 +潖 +潵 +㵐 +澂 +澛 +瑬 +潽 +潾 +潏 +憭 +憕 +戭 +褯 +禤 +嫽 +遹 +璥 +璲 +璒 +憙 +擐 +鄹 +薳 +鞔 +黇 +蕗 +薢 +蕹 +橞 +橑 +橦 +醑 +觱 +磡 +𥕢 +磜 +豮 +鹾 +虤 +暿 +曌 +曈 +㬚 +蹅 +踶 +䗛 +螗 +疁 +㠓 +幪 +嶦 +𨱑 +馞 +穄 +篚 +篯 +簉 +鼽 +衠 +盦 +螣 +縢 +鲭 +鲯 +鲰 +鲺 +鲹 +亸 +癀 +瘭 +羱 +糒 +燋 +熻 +燊 +燚 +燏 +濩 +濋 +澪 +澽 +澴 +澭 +澼 +憷 +憺 +懔 +黉 +嬛 +鹨 +翯 +璱 +𤩽 +璬 +璮 +髽 +擿 +薿 +薸 +檑 +櫆 +檞 +醨 +繄 +磹 +磻 +瞫 +瞵 +蹐 +蟏 +㘎 +镤 +镥 +镨 +𨱔 +矰 +穙 +穜 +穟 +簕 +簃 +簏 +儦 +魋 +斶 +艚 +谿 +䲠 +鲾 +鲿 +鳁 +鳂 +鳈 +鳉 +獯 +䗪 +馘 +襕 +襚 +螱 +甓 +嬬 +嬥 +𦈡 +瓀 +釐 +鬶 +爇 +鞳 +鞮 +藟 +藦 +藨 +鹲 +檫 +黡 +礞 +礌 +𥖨 +蹢 +蹜 +蟫 +䗴 
+嚚 +髃 +镮 +镱 +酂 +馧 +簠 +簝 +簰 +鼫 +鼩 +皦 +臑 +䲢 +鳑 +鳒 +鹱 +鹯 +癗 +𦒍 +旞 +翷 +冁 +䎖 +瀔 +瀍 +瀌 +襜 +䴙 +嚭 +㰀 +鬷 +醭 +蹯 +蠋 +翾 +鳘 +儳 +儴 +鼗 +𩾌 +鳚 +鳛 +麑 +麖 +蠃 +彟 +嬿 +鬒 +蘘 +欂 +醵 +颥 +甗 +𨟠 +巇 +酅 +髎 +犨 +𨭉 +㸌 +爔 +瀱 +瀹 +瀼 +瀵 +襫 +孅 +骦 +耰 +𤫉 +瓖 +鬘 +趯 +罍 +鼱 +鳠 +鳡 +鳣 +爟 +爚 +灈 +韂 +糵 +蘼 +礵 +鹴 +躔 +皭 +龢 +鳤 +亹 +籥 +鼷 +玃 +醾 +齇 +觿 +蠼 +𬣙 +𬇕 +𬣞 +𬘓 +𫭟 +𫭢 +𫇭 +𫐄 +𫵷 +𬇙 +𬣡 +𫸩 +𫘜 +𬘘 +𫘝 +𬨂 +𬀩 +𬀪 +𬬩 +𫍣 +𬣳 +𬩽 +𬮿 +𬯀 +𫰛 +𬳵 +𬳶 +𫠊 +𬍛 +鿍 +𬜬 +𪾢 +𪨰 +𫓧 +𬬮 +𬬱 +𬬭 +𬘡 +𬳽 +𬘩 +𫄧 +𪟝 +𬍤 +𫭼 +𬜯 +𬂩 +𫠆 +𬌗 +𫑡 +𪨶 +𬬸 +𬬻 +𬬹 +𬬿 +𬭁 +𫢸 +𫗧 +𬊈 +𬒈 +𬳿 +𫄨 +𬘫 +𫮃 +鿎 +𬱖 +𬟽 +𫓯 +𫟹 +𫟼 +𬇹 +𬍡 +𬤇 +𫍯 +𬤊 +𫍲 +𬯎 +𬘬 +𬘭 +𬴂 +𫘦 +𫟅 +𬘯 +𫘧 +𪣻 +𬃊 +𬷕 +𫐐 +𬹼 +𫶇 +𫖮 +鿏 +𬭊 +𫓶 +𬭎 +𫖯 +𬱟 +𫛭 +𫷷 +𬮱 +𬊤 +𬴃 +𫘨 +𬪩 +𬒔 +𬨎 +𫐓 +𫫇 +𫓹 +𬭚 +𬭛 +𬕂 +𬶋 +𬶍 +𫔶 +𫌀 +𫖳 +𫘪 +𫘬 +𫞩 +𪤗 +𬸘 +𬒗 +𫚖 +𬭤 +𫚕 +𬶐 +𬶏 +𬸚 +𬤝 +𬙂 +𬭩 +𬸣 +𫍽 +𬴊 +𬞟 +𫟦 +𬺈 +𫠜 +𪩘 +𬭬 +𬭯 +𫗴 +𬸦 +𫄷 +𬭳 +𬭶 +𫔍 +𬭸 +𬭼 +𫔎 +𬸪 +𬶟 +𬶠 +𬶨 +𫄸 +𬟁 +𬙊 +𬶭 +𬶮 +𬙋 +𬺓 +𫚭 +廠 +蔔 +兒 +幾 +幹 +虧 +纔 +與 +萬 +韆 +億 +個 +廣 +門 +義 +衛 +飛 +習 +馬 +鄉 +豐 +開 +無 +雲 +專 +藝 +廳 +區 +歷 +曆 +車 +貝 +岡 +見 +氣 +長 +僕 +幣 +僅 +從 +侖 +倉 +風 +烏 +鳳 +爲 +鬥 +憶 +計 +訂 +認 +譏 +醜 +隊 +辦 +鄧 +勸 +雙 +書 +擊 +撲 +節 +術 +厲 +龍 +滅 +軋 +東 +盧 +業 +舊 +帥 +歸 +葉 +電 +號 +衹 +隻 +嘰 +嘆 +們 +儀 +叢 +爾 +樂 +處 +鼕 +鳥 +務 +飢 +饑 +馮 +閃 +蘭 +匯 +彙 +頭 +漢 +寧 +討 +寫 +讓 +禮 +訓 +議 +訊 +記 +齣 +遼 +邊 +發 +髮 +聖 +對 +臺 +颱 +檯 +糾 +絲 +動 +鞏 +執 +擴 +掃 +場 +揚 +亞 +樸 +機 +權 +過 +協 +壓 +厭 +頁 +誇 +奪 +達 +夾 +軌 +堯 +劃 +邁 +畢 +貞 +師 +塵 +當 +噹 +籲 +嚇 +蟲 +麯 +團 +糰 +嗎 +嶼 +歲 +迴 +豈 +則 +剛 +網 +硃 +遷 +喬 +偉 +傳 +優 +傷 +價 +倫 +華 +僞 +嚮 +後 +會 +殺 +閤 +衆 +爺 +傘 +創 +雜 +負 +壯 +衝 +妝 +莊 +慶 +劉 +齊 +産 +閉 +問 +闖 +關 +燈 +湯 +興 +講 +諱 +軍 +訝 +許 +訛 +論 +訟 +農 +諷 +設 +訪 +訣 +尋 +盡 +儘 +導 +孫 +陣 +陽 +階 +陰 +婦 +媽 +戲 +觀 +歡 +買 +紅 +馱 +纖 +縴 +馴 +約 +級 +紀 +馳 +紉 +壽 +麥 +瑪 +進 +遠 +違 +韌 +運 +撫 +壇 +罎 +壞 +摳 +擾 +貢 +垻 +壩 +摺 +掄 +搶 +墳 +護 +殻 +塊 +聲 +報 +擬 +蕪 +葦 +蒼 +嚴 +蘆 +勞 +蘇 +囌 +極 +楊 +兩 +麗 +醫 +勵 +還 +殲 +來 +連 +軒 +鹵 +滷 +堅 +時 +縣 +裏 +嘔 +園 +曠 +圍 +噸 +郵 +睏 +員 +聽 +嗆 +嗚 +彆 +嶇 +崗 +帳 +財 +針 +釘 +亂 +體 +傭 +徹 +餘 +穀 +鄰 +腸 +龜 +猶 +狽 +條 +島 +飯 +飲 +係 +繫 +凍 +狀 +畝 +庫 +療 +應 +這 +廬 +閏 +閑 +間 +悶 +竈 +燦 +瀝 +淪 +滄 +溝 +滬 +瀋 +懷 +憂 +窮 +證 +啓 +評 +補 +識 +詐 +訴 +診 +詞 +譯 +靈 +層 +遲 +張 +際 +陸 +陳 +墜 +勁 +鷄 +緯 +驅 +純 +紗 +綱 +納 +駁 +縱 +紛 +紙 +紋 +紡 +驢 +紐 +環 +責 +現 +錶 +規 +攏 +揀 +擔 +頂 +擁 +勢 +攔 +擰 +撥 +擇 +蘋 +範 +莖 +樞 +櫃 +闆 +鬆 +槍 
+楓 +構 +喪 +畫 +棗 +賣 +鬱 +礬 +礦 +碼 +厠 +奮 +態 +歐 +毆 +壟 +轟 +頃 +轉 +斬 +輪 +軟 +齒 +虜 +腎 +賢 +國 +暢 +嚨 +鳴 +羅 +幟 +嶺 +凱 +敗 +賬 +販 +貶 +購 +貯 +圖 +釣 +製 +颳 +俠 +僥 +偵 +側 +憑 +僑 +貨 +質 +徑 +捨 +覓 +貪 +貧 +膚 +腫 +脹 +骯 +脅 +魚 +獰 +備 +飾 +飽 +飼 +變 +龐 +廟 +瘧 +劑 +廢 +閘 +鬧 +鄭 +捲 +單 +爐 +淺 +濘 +瀉 +潑 +澤 +憐 +學 +寶 +寵 +審 +簾 +實 +試 +詩 +誠 +襯 +視 +話 +誕 +詭 +詢 +該 +詳 +肅 +録 +隸 +彌 +瀰 +陝 +駕 +參 +艱 +綫 +練 +組 +紳 +細 +駛 +織 +駒 +終 +駐 +絆 +駝 +紹 +繹 +經 +貫 +貳 +幫 +項 +挾 +撓 +趙 +擋 +墊 +擠 +揮 +薦 +帶 +繭 +蕩 +榮 +葷 +熒 +鬍 +蔭 +藥 +標 +棧 +棟 +欄 +檸 +樹 +鹹 +磚 +硯 +麵 +牽 +鷗 +殘 +軸 +輕 +鴉 +戰 +點 +臨 +覽 +竪 +嘗 +啞 +顯 +貴 +蝦 +蟻 +螞 +雖 +駡 +勛 +嘩 +響 +喲 +峽 +罰 +賤 +貼 +貽 +鈣 +鈍 +鈔 +鍾 +鐘 +鋼 +鈉 +鑰 +欽 +鈞 +鈎 +鈕 +氈 +氫 +選 +適 +種 +鞦 +復 +複 +倆 +貸 +順 +儉 +須 +鬚 +劍 +朧 +膽 +勝 +狹 +獅 +獨 +獄 +貿 +餌 +饒 +蝕 +餃 +餅 +巒 +彎 +將 +奬 +瘡 +瘋 +親 +閨 +聞 +閩 +閥 +閣 +養 +薑 +類 +婁 +總 +煉 +爍 +爛 +窪 +潔 +灑 +澆 +濁 +測 +瀏 +濟 +渾 +濃 +惱 +舉 +覺 +憲 +竊 +誡 +誣 +語 +襖 +誤 +誘 +誨 +説 +誦 +墾 +晝 +費 +遜 +隕 +險 +嬌 +賀 +壘 +綁 +絨 +結 +繞 +驕 +繪 +給 +絢 +駱 +絡 +絶 +絞 +駭 +統 +艷 +蠶 +頑 +盞 +撈 +載 +趕 +鹽 +損 +撿 +摯 +剝 +熱 +搗 +壺 +聶 +萊 +蓮 +獲 +穫 +惡 +噁 +瑩 +鶯 +檔 +橋 +樺 +樁 +樣 +賈 +礫 +礎 +顧 +轎 +較 +頓 +斃 +緻 +慮 +監 +緊 +黨 +曬 +曉 +嘮 +鴨 +暈 +鴦 +罷 +圓 +賊 +賄 +賂 +贜 +錢 +鉗 +鑽 +鉀 +鐵 +鈴 +鉛 +犧 +敵 +積 +稱 +筆 +債 +傾 +賃 +艦 +艙 +聳 +愛 +頒 +頌 +臟 +髒 +臍 +膠 +腦 +膿 +鴕 +鴛 +皺 +餓 +餒 +戀 +槳 +漿 +準 +癥 +齋 +離 +資 +競 +閲 +煩 +燒 +燭 +遞 +濤 +澇 +渦 +塗 +滌 +潤 +澗 +漲 +燙 +澀 +憫 +寬 +傢 +賓 +竅 +請 +諸 +諾 +讀 +誹 +襪 +課 +誰 +調 +諒 +諄 +談 +誼 +懇 +劇 +難 +預 +絹 +綉 +驗 +繼 +駿 +瑣 +擲 +據 +摻 +職 +蘿 +螢 +營 +蕭 +薩 +夢 +檢 +醖 +碩 +聾 +襲 +輔 +輛 +顱 +懸 +躍 +纍 +囉 +嘯 +嶄 +邏 +嬰 +銬 +鐺 +鋁 +銅 +銘 +鏟 +銀 +矯 +穢 +籠 +償 +軀 +釁 +銜 +盤 +鴿 +斂 +領 +臉 +獵 +餡 +館 +癢 +鏇 +閻 +闡 +蓋 +斷 +獸 +鴻 +漸 +淵 +漁 +澱 +滲 +慚 +懼 +驚 +慘 +慣 +謀 +諜 +謊 +諧 +禱 +禍 +謂 +諺 +謎 +彈 +墮 +隨 +隱 +嬸 +頗 +頸 +績 +緒 +續 +騎 +綽 +繩 +維 +綿 +綳 +綢 +綜 +綻 +緑 +綴 +瓊 +趨 +攬 +攙 +擱 +摟 +攪 +聯 +蔣 +韓 +橢 +確 +頰 +靂 +暫 +翹 +輩 +鑿 +輝 +賞 +睞 +噴 +疇 +踐 +遺 +鵑 +賦 +賭 +贖 +賜 +賠 +鑄 +鋪 +鏈 +銷 +鎖 +鋤 +鍋 +銹 +鋒 +鋅 +鋭 +鵝 +築 +篩 +儲 +懲 +禦 +釋 +臘 +魯 +憊 +饋 +饞 +裝 +蠻 +闊 +糞 +滯 +濕 +潰 +濺 +灣 +憤 +竄 +窩 +褲 +禪 +謝 +謡 +謗 +謙 +屬 +屢 +緬 +纜 +緝 +緞 +緩 +締 +縷 +騙 +編 +騷 +緣 +鵡 +攝 +擺 +襬 +攤 +鵲 +藍 +濛 +懞 +矇 +獻 +欖 +樓 +賴 +礙 +尷 +霧 +輻 +輯 +輸 +頻 +齡 +鑒 +蹺 +蝸 +錯 +錨 +錫 +鑼 +錘 +錐 +錦 +鍵 +鋸 +錳 +辭 +頽 +籌 +簽 +籤 +簡 +膩 +鵬 +騰 +鮑 +穎 +觸 +雛 +饃 +餾 +醬 +謄 +糧 
+數 +滿 +濾 +濫 +灕 +濱 +灘 +譽 +窺 +寢 +謹 +謬 +闢 +縛 +縫 +纏 +繽 +贅 +墻 +衊 +藹 +檻 +釀 +願 +轄 +輾 +顆 +踴 +蠟 +蠅 +蟬 +賺 +鍬 +鍛 +鍍 +穩 +籮 +簫 +輿 +鮮 +饅 +瀟 +賽 +譚 +譜 +騾 +縮 +攆 +聰 +藴 +櫻 +飄 +黴 +瞞 +題 +囑 +鎮 +鎬 +鎊 +簍 +鯉 +鯽 +癟 +癱 +顔 +鯊 +瀾 +額 +譴 +鶴 +繚 +顛 +轍 +鸚 +贈 +鏡 +贊 +籃 +籬 +鯨 +癮 +辯 +瀕 +懶 +繮 +繳 +矚 +贍 +鰐 +辮 +贏 +驟 +囂 +鐮 +鰭 +鷹 +巔 +顫 +癬 +鱉 +鬢 +鱗 +躪 +贛 +鑲 +韋 +閂 +訃 +勱 +芻 +鄺 +訐 +訌 +訕 +訖 +馭 +璣 +壙 +捫 +薌 +厙 +釔 +傴 +倀 +傖 +獷 +獁 +鳬 +鄔 +餳 +懺 +謳 +詎 +訥 +紆 +紂 +紇 +紈 +璵 +摶 +塢 +㩳 +蕓 +藶 +莧 +萇 +蓯 +磯 +奩 +歟 +軔 +鄴 +嘸 +囈 +嚦 +暘 +唄 +幃 +峴 +嵐 +圇 +釗 +釙 +釕 +僉 +鳩 +鄒 +飩 +餼 +飪 +飫 +飭 +廡 +癤 +闈 +閎 +閔 +煬 +灃 +漚 +渢 +潙 +憮 +慪 +愾 +悵 +愴 +詁 +訶 +詛 +詆 +謅 +詔 +詒 +隴 +陘 +嫵 +嫗 +嬀 +剄 +紜 +紕 +紝 +綸 +紓 +瑋 +匭 +壚 +擓 +蘢 +蔦 +塋 +煢 +櫪 +梘 +棖 +樅 +碭 +甌 +郟 +軛 +鳶 +曇 +蟣 +黽 +嚀 +噝 +巋 +劌 +剴 +嶧 +釷 +釺 +釧 +釩 +釹 +釵 +儈 +儕 +儂 +劊 +慫 +糴 +戧 +膞 +邇 +梟 +餞 +飴 +癘 +瘍 +煒 +熰 +熗 +瀧 +瀘 +濼 +涇 +㥮 +懌 +誆 +誄 +詿 +詰 +詼 +鄆 +禕 +誅 +詵 +詬 +詮 +詣 +諍 +詫 +諢 +詡 +駑 +紺 +紲 +紱 +駟 +駙 +縐 +絀 +驛 +駘 +瓏 +頇 +埡 +撾 +撻 +賁 +壋 +撏 +莢 +貰 +蓽 +蕎 +薈 +薺 +堊 +滎 +犖 +蕁 +藎 +蓀 +蕒 +葤 +櫛 +櫳 +櫨 +櫟 +檉 +酈 +硨 +碸 +殤 +軲 +軻 +轤 +軼 +軫 +蠆 +覘 +瞘 +嘵 +嗶 +噦 +剮 +鄖 +噲 +噥 +嶢 +幀 +嶠 +貺 +鈈 +鈦 +鋇 +鈑 +鈐 +鎢 +鈁 +鈀 +篤 +儔 +儼 +儷 +腖 +臚 +脛 +鴇 +獪 +颮 +猻 +餉 +餄 +餎 +孿 +孌 +癧 +瘲 +颯 +闥 +閭 +闓 +閡 +熾 +烴 +浹 +澮 +滸 +潯 +濜 +慟 +懨 +愷 +惻 +惲 +誚 +禰 +誥 +誑 +鴆 +婭 +嬈 +懟 +絝 +驍 +驊 +絎 +絳 +駢 +頊 +璫 +琿 +塒 +塤 +堝 +贄 +蒔 +萵 +蕕 +鴣 +蒓 +橈 +楨 +榿 +檜 +邐 +礪 +礱 +軾 +輊 +輅 +鶇 +躉 +齔 +鸕 +矓 +嘜 +鴞 +蜆 +嗩 +嶗 +崍 +覬 +賅 +鈺 +鉦 +鈷 +鉢 +鈸 +鉞 +鉭 +鉬 +鈿 +鈾 +鉑 +鑠 +鉚 +鈰 +鉉 +鉈 +鉍 +鈮 +鈹 +鏺 +鐸 +氬 +筧 +頎 +徠 +膾 +鴟 +璽 +鴝 +獫 +裊 +餑 +欒 +攣 +癰 +痙 +頏 +閫 +鬮 +誾 +閬 +鄲 +燁 +燴 +燼 +淶 +漣 +潿 +慳 +諏 +諑 +禎 +諉 +諛 +諗 +諂 +誶 +媧 +嫻 +綆 +驪 +綃 +騁 +綏 +縧 +綈 +駸 +鷥 +燾 +璉 +麩 +擄 +摑 +鷙 +撣 +慤 +摜 +縈 +槤 +覡 +欞 +嗇 +匱 +硤 +磽 +鴯 +龔 +殞 +殮 +賚 +輒 +塹 +嘖 +囀 +嚙 +蹌 +蠣 +蠱 +蟶 +幘 +幗 +賕 +賑 +賒 +銠 +鉺 +鋏 +鐃 +銦 +鎧 +鍘 +銖 +銑 +鋌 +鏵 +銓 +鎩 +鉿 +銚 +鉻 +錚 +銫 +鉸 +銥 +銃 +銨 +銣 +鴰 +穠 +箋 +籩 +僨 +僂 +皚 +鴴 +艫 +龕 +玀 +獼 +餜 +餛 +鸞 +闍 +閾 +閹 +閶 +鬩 +閽 +閼 +羥 +糲 +燜 +漬 +瀆 +澠 +愜 +憚 +諶 +諫 +皸 +謔 +襠 +謁 +諤 +諭 +諼 +讒 +諳 +諦 +諞 +糶 +嬋 +綾 +騏 +綺 +緋 +緔 +騍 +緄 +騅 +綬 +綹 +綣 +綰 +驂 +緇 +靚 +輦 +黿 +頡 +撳 +蟄 +壪 +蔞 +櫝 +欏 +賫 +鵓 +鸝 +殫 +輥 +輞 +槧 +輟 +輜 +瞼 +躒 +蛺 +蟯 +螄 +蠐 +嘍 +嶸 +嶁 +賧 +鋙 +錸 +鏗 +鋥 +鋰 +鋯 +鋨 +銼 +鐧 +銻 +鋃 +鋦 +錒 +犢 +鵠 +篳 +牘 +儻 +儐 +儺 +嬃 +頜 +鵒 
+魷 +魨 +魴 +潁 +颶 +觴 +熲 +餷 +餿 +褻 +臠 +癆 +癇 +賡 +頦 +鷳 +闌 +闃 +闋 +鵜 +憒 +嚳 +謨 +褳 +襇 +讜 +謖 +謚 +謐 +騭 +巰 +翬 +騖 +緙 +緗 +緘 +緹 +緲 +緦 +緱 +縋 +緡 +饗 +耮 +驁 +韞 +攄 +擯 +轂 +驀 +鶓 +薊 +蘺 +鎣 +頤 +櫚 +櫸 +磧 +磣 +鵪 +輳 +齟 +齙 +韙 +囁 +躂 +蹕 +躚 +躋 +噯 +鍺 +錛 +錡 +鍀 +錁 +錕 +錮 +鍁 +錈 +錠 +錙 +覦 +頷 +鮁 +鮃 +鮎 +鱸 +穌 +鮒 +鮐 +鵮 +颼 +饈 +鶉 +瘮 +闔 +闐 +闕 +灧 +瀅 +潷 +灤 +澦 +懾 +鱟 +騫 +竇 +謾 +謫 +嬡 +嬪 +縉 +縝 +縟 +轡 +騮 +縞 +縭 +縊 +縑 +騸 +覯 +韜 +靉 +攖 +薔 +藺 +鶘 +檳 +櫧 +釅 +殯 +霽 +轅 +齜 +齦 +瞜 +曖 +躊 +蟈 +鶚 +嚶 +羆 +賻 +罌 +鶻 +鍥 +鍇 +鍶 +鍔 +鍤 +鏘 +鎂 +鏤 +簀 +篋 +簞 +籙 +臏 +鮭 +鮪 +鱭 +鮫 +鱘 +饉 +鑾 +瘻 +闞 +鮝 +糝 +鷀 +瀲 +濰 +譖 +褸 +譙 +讕 +譎 +鶥 +嬙 +鶩 +驃 +縹 +縵 +縲 +纓 +驄 +繆 +繅 +耬 +瓔 +擷 +擼 +攛 +聵 +覲 +韃 +鞽 +蘄 +賾 +檣 +靨 +魘 +饜 +轆 +齬 +齪 +覷 +顒 +躓 +躑 +蠑 +螻 +顎 +嚕 +顓 +鑷 +鎘 +鎸 +鎳 +鎦 +鎰 +鎵 +鑌 +簣 +鷂 +鯁 +鱺 +鰱 +鰹 +鰣 +鯀 +鯇 +觶 +饊 +饌 +齏 +讞 +襤 +譫 +屨 +纈 +繕 +繒 +驏 +擻 +顳 +顢 +藪 +櫓 +櫞 +贋 +飆 +鏨 +轔 +蟎 +鐯 +鏢 +鏜 +鏝 +鏰 +鏞 +鏑 +鏃 +鏐 +氌 +穡 +魎 +鯪 +鯡 +鯤 +鯧 +鯝 +鯢 +鯛 +鯔 +獺 +鷓 +贇 +癭 +斕 +瀨 +顙 +繾 +繰 +繯 +蘚 +鷯 +齲 +齷 +躡 +蹣 +羈 +鐔 +鐝 +鐐 +鐓 +鑭 +鑹 +鏹 +鐙 +籪 +鷦 +鱝 +鰈 +鯷 +鰓 +鰍 +鰉 +鯿 +鷲 +懣 +鷸 +鰲 +韉 +顥 +鷺 +䴉 +髏 +鑊 +鐳 +鐲 +讎 +鰨 +鰥 +鰩 +癩 +攢 +靄 +躥 +髖 +髕 +鑔 +籟 +鰳 +鰾 +鱈 +鰻 +鱅 +讖 +驥 +纘 +瓚 +鼉 +黷 +黲 +鑣 +鑞 +臢 +鱖 +鱔 +鱒 +驤 +顰 +鱧 +癲 +灝 +鸛 +鑱 +趲 +顴 +躦 +饢 +戇 +戔 +訏 +訒 +釓 +俔 +閆 +澫 +訢 +訩 +詝 +紃 +纊 +瑒 +剗 +塸 +壢 +埨 +撝 +蔿 +榪 +軑 +軏 +咼 +㠣 +覎 +㑳 +颺 +閌 +潕 +湋 +澐 +浿 +諓 +禡 +詗 +詘 +詖 +屓 +彄 +紘 +馹 +馼 +紵 +紞 +駃 +紖 +瑲 +薴 +棡 +軝 +暐 +晛 +崬 +釴 +釤 +鍆 +鍚 +鄶 +獮 +飿 +嶨 +詷 +詪 +鄩 +鳲 +隑 +隮 +娙 +逕 +駓 +駔 +駉 +絅 +騶 +䮄 +紼 +紿 +瓅 +韍 +墶 +塏 +薘 +蕘 +蔄 +葒 +鳾 +龑 +軹 +軤 +轢 +軺 +睍 +曨 +噠 +鈃 +鈇 +鉅 +鋹 +釿 +錀 +鈧 +鈥 +鈄 +倈 +艤 +鶬 +颭 +餏 +湞 +溮 +滻 +褘 +絰 +駰 +絪 +駪 +綎 +綖 +驫 +勣 +璕 +𡑍 +䓣 +薟 +藭 +椏 +梜 +頍 +硜 +輄 +輈 +輇 +貲 +嗊 +曄 +暉 +鄳 +幬 +輋 +嶮 +贐 +鉥 +鉕 +鑪 +鉮 +鉊 +鉧 +僤 +鴒 +魛 +餗 +燖 +溳 +礐 +窵 +襏 +駼 +絺 +綌 +騂 +綄 +璡 +墠 +壼 +聹 +蘀 +勩 +罃 +檮 +棶 +厴 +䃮 +磑 +礄 +鴷 +齕 +頔 +廼 +凢 +亾 +枒 +屍 +匃 +匄 +紥 +紮 +疋 +殀 +讐 +觔 +兇 +宂 +㕥 +㠯 +栞 +佈 +佔 +呌 +敂 +冄 +坵 +僊 +怱 +悤 +冊 +夘 +戼 +牠 +妳 +嬭 +摃 +釦 +攷 +託 +衺 +衕 +弔 +喫 +囙 +㠶 +颿 +秊 +倣 +髣 +佀 +朶 +氷 +決 +併 +並 +竝 +汙 +汚 +異 +姦 +廵 +挵 +衖 +搤 +阯 +撦 +埳 +阬 +誌 +㕁 +卻 +刦 +刧 +刼 +芲 +蘤 +桿 +槓 +荳 +獃 +唫 +脗 +皁 +彿 +髴 +疘 +刪 +鉋 +鑤 +況 +牀 +恡 +棄 +洶 +汎 +災 +烖 +菑 +禩 +侷 +跼 +坿 +玅 +姉 +妬 +翫 +搨 +柺 +拕 +牴 +觝 +倖 +抝 +盃 +桮 +傑 +逩 +肎 +菓 +崐 +崑 +呪 +虖 +嘑 +謼 +詠 +㟁 
+嵒 +巗 +巖 +雰 +稈 +咊 +嶽 +妷 +姪 +廹 +徃 +餚 +採 +寀 +唸 +週 +昬 +兎 +兔 +亯 +亱 +䘚 +淨 +劵 +匟 +㳒 +灋 +洩 +霑 +淚 +註 +恠 +箒 +屆 +絃 +圅 +旾 +珎 +掛 +垜 +艸 +茘 +査 +栢 +柵 +栁 +桺 +柹 +韮 +揹 +昰 +閧 +鬨 +冐 +暎 +嚥 +倃 +𠴰 +偺 +喒 +齩 +欬 +榘 +㑺 +儁 +敍 +敘 +肧 +脈 +䘑 +衇 +跡 +蹟 +砲 +礮 +薙 +鬀 +恆 +怳 +卹 +䘏 +賉 +婣 +畊 +揑 +綑 +輓 +恥 +躭 +晉 +棲 +覈 +慄 +翄 +脣 +槕 +㨪 +螡 +蟁 +㤙 +陗 +峩 +峯 +乗 +椉 +咲 +筍 +俛 +頫 +勌 +䠶 +躳 +慇 +拏 +㧱 +挐 +脃 +胷 +肐 +貍 +㽞 +畱 +淒 +悽 +蓆 +効 +傚 +涼 +缾 +菸 +煙 +淛 +湧 +誖 +猂 +醼 +讌 +㝠 +寃 +孃 +桒 +毬 +瑠 +璢 +瑯 +㨗 +搥 +搯 +蔆 +惏 +楳 +槑 +捄 +廂 +慽 +慼 +瞇 +埜 +畧 +虵 +稭 +棃 +犂 +迻 +媮 +兠 +舩 +慾 +綵 +腳 +𩓐 +夠 +豬 +貓 +湊 +減 +庻 +蔴 +菴 +朢 +睠 +觕 +麤 +釬 +銲 +痳 +殽 +婬 +滛 +湻 +㴱 +樑 +顇 +㝛 +窰 +窯 +琹 +欵 +墖 +趂 +隄 +愽 +揷 +揫 +煑 +朞 +㪚 +塟 +蔥 +蔕 +稜 +棊 +碁 +椶 +偪 +㕑 +廚 +廈 +鴈 +冣 +㝡 +晳 +鼃 +餧 +餵 +嗁 +諠 +㡌 +賸 +筴 +筞 +筩 +栰 +暠 +皜 +踰 +蝟 +㪟 +燄 +遊 +媿 +嘅 +庽 +窓 +牎 +牕 +窻 +徧 +僱 +帬 +裠 +強 +彊 +疎 +壻 +瓌 +䰟 +皷 +擕 +㩗 +㩦 +攜 +懃 +鞾 +幙 +㮣 +酧 +詶 +醻 +掽 +踫 +㼝 +盌 +磟 +覩 +倸 +㬉 +煗 +煖 +晻 +闇 +炤 +跥 +䗬 +蠭 +寘 +辠 +稺 +穉 +燬 +譭 +瘉 +癒 +顋 +骽 +猨 +蝯 +稟 +痺 +癡 +亷 +㢘 +韻 +泝 +遡 +昚 +躶 +臝 +羣 +㬪 +曡 +疊 +勦 +琍 +瓈 +𤋮 +熈 +牓 +搾 +謌 +堿 +鹻 +鹼 +矁 +燻 +髈 +𤺥 +辢 +旂 +𡚁 +潄 +砦 +詧 +嫰 +櫈 +撐 +墪 +譔 +鞵 +鞌 +蕋 +橤 +蘂 +醕 +譆 +跴 +蹤 +蜨 +蠍 +稾 +殭 +惪 +厀 +襃 +癅 +䊀 +餬 +潛 +癄 +顦 +鷰 +藷 +櫥 +螎 +蹏 +蟇 +譟 +簒 +彫 +琱 +鵰 +餹 +餻 +簷 +粦 +燐 +緐 +幑 +蹧 +粇 +穅 +臋 +籐 +繙 +飜 +孼 +蠏 +燿 +蝡 +稬 +穤 +惷 +覇 +鑵 +戹 +阨 +剳 +帀 +巵 +亙 +佇 +竚 +穽 +岅 +虯 +𦍑 +羗 +啎 +姙 +㘭 +袟 +袠 +逈 +㒺 +犛 +氂 +偘 +甕 +罋 +冺 +姍 +蝨 +琺 +瑇 +尅 +梔 +斮 +斲 +斵 +暱 +毘 +蝱 +吚 +哶 +峝 +粃 +竢 +狥 +秈 +烱 +㳄 +袵 +盇 +涖 +蒞 +碪 +蠔 +唕 +倐 +儵 +雋 +皐 +臯 +衂 +䶊 +臙 +獧 +痾 +皰 +湼 +澣 +濬 +塚 +襢 +娿 +勅 +勑 +戞 +廐 +廄 +眥 +覜 +勗 +啗 +噉 +傯 +挱 +㥫 +惥 +慂 +陻 +蕚 +萲 +蕿 +蘐 +藼 +櫂 +箠 +槨 +啑 +蹠 +蚘 +痐 +蛕 +蜖 +瘖 +遯 +醃 +飱 +冪 +簑 +枏 +柟 +檝 +楥 +矴 +椗 +嘷 +獋 +粺 +䈰 +諐 +齶 +堘 +疿 +雝 +秔 +稉 +槀 +搉 +廝 +叡 +嘠 +蜋 +筯 +篛 +麞 +糉 +緥 +璿 +髥 +臕 +餈 +剹 +橜 +罇 +蜺 +矙 +憇 +翺 +饍 +瞖 +羴 +羶 +爕 +繦 +騌 +鬉 +騣 +蔾 +䠀 +簮 +躕 +蹵 +䝔 +貛 +鼴 +麐 +塡 +あ +い +う +え +お +か +き +く +け +こ +さ +し +す +せ +そ +た +ち +つ +て +と +な +に +ぬ +ね +の +は +ひ +ふ +へ +ほ +ま +み +む +め +も +や +ゆ +よ +ら +り +る +れ +ろ +わ +を +ん +が +ぎ +ぐ +げ +ご +ざ +じ +ず +ぜ +ぞ +だ +ぢ +づ +で +ど +ば +び +ぶ +べ +ぼ +ぱ +ぴ +ぷ +ぺ +ぽ +ぁ +ぃ +ぅ +ぇ +ぉ +っ +ゃ +ゅ +ょ +ゎ +ゕ +ゖ +ア +イ +ウ +エ +オ +カ +キ +ク +ケ +コ +サ +シ +ス +セ +ソ +タ +チ +ツ +テ +ト +ナ +ニ +ヌ +ネ 
+ノ +ハ +ヒ +フ +ヘ +ホ +マ +ミ +ム +メ +モ +ヤ +ユ +ヨ +ラ +リ +ル +レ +ロ +ワ +ヲ +ン +ガ +ギ +グ +ゲ +ゴ +ザ +ジ +ズ +ゼ +ゾ +ダ +ヂ +ヅ +デ +ド +バ +ビ +ブ +ベ +ボ +パ +ピ +プ +ペ +ポ +ァ +ィ +ゥ +ェ +ォ +ッ +ャ +ュ +ョ +ヮ +ヵ +ヶ +ヷ +ヸ +ヹ +ヺ +・ +ー +ヽ +ヾ +ヿ +ア +イ +ウ +エ +オ +カ +キ +ク +ケ +コ +サ +シ +ス +セ +ソ +タ +チ +ツ +テ +ト +ナ +ニ +ヌ +ネ +ノ +ハ +ヒ +フ +ヘ +ホ +マ +ミ +ム +メ +モ +ヤ +ユ +ヨ +ラ +リ +ル +レ +ロ +ワ +ヲ +ン +゙ +゚ +ァ +ィ +ゥ +ェ +ォ +ッ +ャ +ュ +ョ +円 +気 +糸 +絵 +楽 +帰 +戸 +広 +黒 +図 +線 +読 +売 +歩 +毎 +亜 +悪 +圧 +扱 +囲 +為 +壱 +隠 +栄 +営 +駅 +塩 +縁 +艶 +応 +桜 +穏 +仮 +価 +箇 +ゑ +ゝ +ゞ +ヰ +ヴ +㈱ +両 +丼 +丿 +亀 +仏 +伝 +侶 +俤 +値 +倶 +倹 +偐 +偽 +働 +儛 +兌 +児 +冑 +冨 +凞 +処 +凪 +別 +剣 +剤 +剰 +劔 +労 +勧 +勲 +匁 +匂 +匲 +卍 +単 +厳 +収 +呂 +呉 +呑 +呰 +唖 +喚 +喩 +喰 +噛 +噺 +嚢 +囃 +団 +圀 +圏 +堀 +堺 +塀 +塁 +塙 +増 +墺 +壊 +壌 +壷 +変 +奨 +姫 +娯 +嫐 +嬢 +嬾 +孁 +宍 +実 +宮 +寔 +寛 +対 +専 +尭 +峠 +崋 +嶋 +巀 +巌 +巣 +巻 +帯 +幇 +庁 +廃 +廻 +弉 +弌 +弐 +弖 +弾 +従 +徳 +徴 +忯 +恵 +悩 +惣 +懐 +懽 +戦 +戯 +戻 +払 +抜 +択 +拝 +拠 +拡 +拵 +挙 +挿 +捗 +捜 +掟 +掲 +掻 +揃 +換 +揺 +摂 +撃 +撹 +斉 +斎 +旛 +旡 +晧 +晩 +暁 +暦 +曽 +杁 +杢 +杣 +杮 +枓 +枠 +枡 +柾 +栂 +栃 +桝 +桟 +桾 +梛 +梱 +梲 +梶 +椙 +検 +椥 +楕 +楡 +楢 +榊 +榎 +槇 +様 +槙 +槻 +樋 +権 +樫 +橿 +檥 +欅 +歎 +歓 +歯 +歳 +歴 +毀 +沖 +沢 +浄 +涙 +済 +渉 +渋 +渓 +渕 +満 +滝 +漑 +潅 +澁 +瀞 +瀬 +焔 +焼 +煇 +煕 +煥 +燗 +爼 +犠 +狛 +猟 +獏 +獣 +珊 +瑤 +甞 +畑 +畠 +畳 +畷 +畺 +痩 +癪 +発 +県 +眞 +砕 +碕 +礒 +禖 +禿 +稲 +穂 +穣 +竃 +竜 +竴 +笹 +筈 +筬 +筰 +箆 +箏 +箙 +篠 +篭 +簺 +籾 +粂 +粋 +粛 +粧 +糺 +紬 +絁 +経 +絖 +絣 +絽 +継 +続 +綟 +総 +縄 +縅 +縒 +縦 +繊 +繋 +繍 +繝 +繧 +纐 +纒 +罠 +罧 +罵 +羂 +羇 +羨 +聟 +聡 +聨 +聴 +脇 +脳 +膣 +膵 +臈 +臓 +臥 +舎 +舖 +舗 +舘 +芿 +苅 +茲 +荊 +荘 +莬 +莵 +菫 +萠 +蔵 +薗 +薫 +薬 +薭 +蘊 +蛍 +蝋 +蝿 +蟷 +衞 +衵 +袙 +袞 +袰 +袴 +袿 +裃 +裡 +裲 +褄 +褌 +襴 +襷 +覗 +覚 +覧 +観 +訳 +証 +諌 +諚 +諟 +諡 +諮 +譛 +譲 +讃 +豅 +豊 +豎 +賎 +賛 +贔 +躙 +躰 +転 +軽 +輌 +辥 +辺 +辻 +込 +逓 +遅 +遙 +邉 +郷 +酔 +醗 +醤 +醸 +釈 +鉄 +鉇 +鉤 +鉱 +鉾 +銈 +銕 +銭 +鋲 +鋳 +鋺 +錆 +錍 +錣 +錬 +錵 +鍑 +鍮 +鍼 +鎌 +鎗 +鎚 +鎹 +鐇 +鐚 +鐡 +鑁 +鑑 +鑚 +鑢 +閇 +関 +閦 +闘 +陥 +険 +隣 +隷 +雑 +雫 +霊 +靜 +靫 +靭 +靱 +鞄 +鞆 +頚 +頬 +頴 +頼 +顕 +顗 +餝 +饂 +駄 +駆 +駈 +騒 +験 +騨 +髄 +髙 +髪 +髷 +鯖 +鯰 +鯱 +鰒 +鰯 +鰰 +鳰 +鴎 +鴫 +鵄 +鵞 +鵺 +鶏 +鹸 +麁 +麺 +麿 +黌 +黙 +鼈 +齢 +龗 +縯 +蟅 +坖 +祂 +鼂 +鱚 +蛻 +屌 +呾 +煔 +吶 +扥 +蚖 +銂 +尃 +夋 +鵼 +徬 +寳 +彡 +舨 +湳 +麼 +鍈 +崈 +鱣 +盺 +拺 +瑥 +茷 +焻 +奀 +驎 +鱰 +砢 +痟 +廱 +僜 +瘺 +鱊 +擥 +嶰 +淓 +跅 +浵 +媗 +璦 +煠 +檊 +媃 +峅 +躄 +鉟 +塽 
+蟴 +鯮 +弍 +烒 +鵵 +妑 +孋 +蚡 +恊 +輭 +廞 +產 +曅 +盜 +騤 +囪 +鱀 +茇 +葊 +逹 +狓 +崢 +趖 +凃 +羙 +鮸 +昞 +楿 +渽 +圗 +麪 +屇 +鍉 +葝 +沯 +爭 +幵 +筭 +寊 +銋 +貮 +鎭 +熺 +昜 +鍱 +墬 +愒 +磺 +嚈 +稘 +珮 +釆 +殑 +鍩 +䲁 +蕷 +鐿 +僡 +佹 +輶 +冴 +襶 +賔 +猙 +辧 +絛 +磾 +韁 +螔 +譳 +礑 +鋱 +魩 +嚗 +棆 +牆 +敟 +柶 +瓛 +魣 +巎 +轘 +襌 +枼 +鸌 +逺 +錏 +縡 +帢 +騄 +媼 +埅 +鄤 +萐 +祙 +旼 +詥 +鶲 +燉 +卲 +銱 +庲 +伱 +氽 +嵿 +挻 +煵 +窋 +鐤 +鮊 +鱬 +鰧 +嬤 +譞 +諲 +脭 +悳 +崘 +阭 +內 +袾 +冚 +壐 +咗 +礠 +孮 +痲 +埈 +肹 +鰮 +鮓 +濊 +塜 +凜 +蒢 +噰 +桼 +峍 +焴 +鶒 +鋮 +綠 +鶹 +熿 +毴 +咟 +嘥 +睺 +繡 +郎 +瘞 +鉶 +蔎 +秠 +緤 +蝀 +躝 +蟜 +繃 +囮 +墫 +乭 +胊 +濙 +瘓 +榣 +鑛 +鐫 +嶴 +甹 +坮 +銾 +蒭 +睜 +俋 +餠 +榢 +蓳 +盋 +堷 +鍏 +苝 +巛 +蚵 +暏 +熤 +嬨 +墎 +鏽 +戶 +菺 +膮 +熖 +睪 +栜 +捱 +榗 +鍷 +曧 +犽 +韑 +袓 +䖝 +焄 +喦 +髲 +疌 +㴪 +侊 +貐 +蕅 +禠 +蕑 +囯 +暊 +儞 +佋 +柎 +㐱 +鰤 +苳 +鱥 +謤 +遶 +眀 +鑀 +羋 +顏 +陜 +銩 +黶 +苼 +蒤 +棛 +儫 +咁 +抦 +衚 +棩 +焿 +脫 +麅 +玏 +埧 +淸 +黁 +淽 +彠 +鮨 +沜 +糀 +厓 +楧 +嶌 +簹 +檵 +鱇 +嶬 +廸 +卽 +樀 +贌 +酼 +籛 +沒 +晸 +諪 +蕡 +妏 +鄋 +蒍 +奧 +抇 +蓨 +薆 +鱷 +巘 +䝉 +亰 +寈 +槩 +誒 +麴 +蕟 +溎 +蘗 +榦 +斿 +暟 +炲 +拚 +娖 +繖 +橚 +寜 +爀 +饟 +悅 +鯏 +彜 +眾 +葯 +嬝 +埮 +獇 +馛 +溙 +瀦 +熼 +硓 +鈢 +樆 +輬 +鰜 +蔘 +渙 +澔 +嗮 +旉 +籜 +媊 +燘 +儚 +頹 +缽 +俽 +逨 +鱓 +郞 +歊 +杴 +珡 +杋 +醁 +鰏 +鵾 +鐽 +鮋 +巶 +荅 +薾 +囓 +蹻 +獎 +禑 +鎓 +榲 +僴 +綞 +尓 +敭 +曔 +褔 +鬅 +亊 +鏦 +蓘 +裬 +鱲 +薡 +鰗 +箑 +鬪 +縂 +璸 +甙 +茮 +辵 +岻 +覿 +滈 +鯶 +鑂 +囶 +舺 +溋 +拋 +菾 +敾 +虨 +綝 +蝍 +醂 +禨 +賹 +廧 +絕 +槗 +徫 +鎔 +曮 +蠂 +捒 +堈 +莕 +蓪 +敎 +禃 +櫱 +綧 +瀶 +逌 +浤 +碻 +刄 +逤 +剏 +氹 +菈 +娫 +蜛 +嵗 +糎 +螶 +譓 +鏳 +嵙 +瑊 +隲 +檨 +緈 +畵 +砯 +簗 +彅 +鰺 +騋 +窶 +嚒 +嵻 +尙 +頵 +槰 +虉 +醞 +巂 +彔 +偊 +畇 +鱨 +妸 +塲 +畐 +鈫 +錟 +磪 +摠 +彥 +璙 +囝 +寗 +耎 +鮡 +蘓 +弅 +焃 +飥 +戙 +塰 +儱 +槺 +噏 +魟 +禵 +佧 +咘 +盪 +瑈 +鉲 +睭 +鏌 +鼇 +郋 +魮 +朖 +滽 +渃 +滙 +熯 +醿 +鎅 +褀 +鬬 +巄 +螥 +眜 +釚 +柉 +壎 +峇 +姸 +唭 +鮜 +鈖 +嫈 +壄 +洤 +黃 +伕 +堦 +嶔 +鮰 +鞞 +漎 +鉓 +鮗 +壴 +阝 +妀 +矽 +獢 +倗 +銪 +鴓 +橒 +凈 +哖 +屚 +偍 +瑺 +媯 +淍 +驌 +椇 +赬 +薐 +糹 +碽 +濲 +釭 +晭 +纕 +寖 +閞 +歿 +呎 +鶆 +屄 +櫿 +犎 +旲 +㙟 +龎 +翜 +螾 +說 +衜 +泆 +軎 +鵂 +荎 +嚧 +硂 +桖 +褭 +筊 +鰷 +秳 +戩 +轀 +鬹 +飬 +卋 +暸 +狦 +搢 +娋 +鏴 +溫 +毉 +淰 +謩 +餺 +鵙 +鳽 +鮀 +狶 +氻 +轝 +妺 +袛 +蓭 +梂 +娛 +牼 +稅 +兿 +玾 +煚 +僩 +鶿 +鬄 +崠 +鉆 +鯓 +蚢 +庀 +鵟 +坣 +殼 +悞 +熅 +敻 +鍠 +曶 +愼 +搳 +姃 +砳 +槼 +臞 +韾 +靑 +鸊 +薲 +虛 +蠄 +啟 +鶺 +苺 +滾 +褞 +仺 +胇 +憻 +郳 +烉 +驩 +冇 +枖 +夌 +搵 +匸 +盨 +櫾 +霤 +麊 +貒 +噓 +嗢 +笩 +晈 +冂 +銳 +毿 +慜 +囧 +閜 +娸 +庢 +壆 +馯 +桱 +兗 +葃 +侅 +煐 +鐦 +藸 +鷎 +嵰 +逎 +弒 +匋 +鐭 +廔 
+砩 +孆 +灴 +伷 +兪 +鴗 +澯 +幚 +旙 +勻 +礽 +婑 +鱮 +娍 +銶 +吳 +鍟 +仼 +鳧 +彞 +娽 +昛 +鰼 +剎 +佉 +鉏 +偸 +鰆 +讙 +橪 +啱 +岀 +孻 +釪 +乹 +鈳 +漇 +檦 +埻 +祿 +爌 +禇 +鱵 +㸃 +梉 +燝 +霙 +炁 +飮 +蠙 +勷 +鵎 +儥 +鐠 +唻 +廰 +嚿 +嵕 +墱 +紑 +搖 +瘜 +皝 +鸑 +瀁 +粵 +撚 +巑 +梀 +啯 +眛 +諴 +夊 +僙 +鍝 +裖 +鮣 +凬 +飡 +灊 +橓 +嫳 +筳 +咑 +粍 +瓑 +璌 +伃 +閰 +傜 +黐 +謢 +驒 +橫 +蛯 +寕 +蠵 +瞓 +旳 +翏 +硏 +寯 +韡 +楤 +鰃 +朿 +侞 +鵯 +愨 +祹 +厔 +丌 +盩 +謏 +魕 +啣 +閱 +曺 +枛 +罉 +卐 +樻 +鷉 +鯒 +鋡 +磱 +枱 +攴 +蠷 +穈 +嚟 +檽 +趐 +奐 +鋐 +檇 +薀 +峼 +咭 +訔 +韠 +鑴 +鸐 +唃 +捦 +鸜 +誴 +罳 +璄 +暃 +夀 +賨 +鞥 +鈊 +灡 +鮍 +懮 +籣 +昐 +陁 +襾 +鮠 +鈏 +囍 +婯 +艔 +貭 +䰾 +姁 +禼 +堖 +鋶 +仛 +鏷 +謜 +鑅 +忬 +蘶 +謠 +觙 +奫 +狟 +泩 +桙 +飈 +垰 +啍 +嚞 +鯕 +蒧 +榞 +徸 +璹 +揔 +欉 +魞 +菶 +玧 +鳯 +廍 +侚 +岰 +岧 +鋕 +凵 +彣 +崱 +媜 +倢 +鵐 +砋 +鷚 +鱠 +鮻 +繻 +摵 +贓 +磵 +錻 +痠 +粩 +胅 +奣 +塨 +瀠 +鸘 +啚 +娳 +霶 +壔 +峚 +甂 +廁 +覌 +鰂 +猳 +鱻 +盫 +裿 +杬 +歛 +澋 +蘞 +嵜 +尐 +旽 +鉌 +鎛 +豿 +凖 +榤 +禓 +龝 +悧 +鷟 +鮟 +吋 +喢 +岪 +吥 +漵 +頠 +豔 +巿 +鑨 +醣 +熳 +懍 +湥 +檡 +韺 +戱 +緖 +鐈 +凉 +緃 +鮹 +媐 +爯 +巆 +褍 +鐬 +昍 +扙 +鍳 +芛 +蟳 +嬅 +糬 +吔 +塭 +譿 +冧 +鏓 +嶪 +嗹 +椵 +姀 +閿 +褧 +錞 +玆 +笘 +篔 +萡 +鶡 +螐 +鮄 +鰟 +脷 +啲 +杤 +蓚 +尗 +娎 +殟 +淥 +蝚 +蓧 +彐 +嚤 +銍 +囒 +坶 +淩 +鶼 +鱂 +喼 +燫 +肏 +姵 +廌 +禟 +籝 +迵 +嵨 +堮 +蟌 +憍 +廕 +蜑 +緁 +唘 +竩 +崙 +璚 +粄 +栨 +罈 +梫 +貤 +藔 +蜯 +訁 +斖 +煶 +馦 +妠 +閟 +疕 +夆 +鎪 +膥 +澻 +嘢 +嚐 +靁 +鎻 +鰛 +穵 +烋 +縕 +褎 +疒 +壠 +溼 +圂 +咅 +鯭 +鯙 +磘 +玨 +珤 +朊 +蚼 +濶 +薞 +嚩 +丟 +嫺 +鯻 +椲 +鰕 +刂 +蠘 +踎 +瀴 +琁 +鰶 +瑴 +肜 +㐂 +欥 +媺 +竻 +讚 +𣇉 +裵 +緜 +廩 +齧 +叄 +俌 +厰 +滀 +錄 +鷫 +鯗 +攞 +姌 +蔝 +幷 +縤 +屻 +鯃 +雞 +纁 +嫲 +嵮 +屭 +嶃 +跩 +鋗 +蕢 +篊 +俬 +淎 +暻 +鏻 +憓 +玗 +溈 +笭 +糢 +勳 +閒 +沍 +咾 +鉷 +蘵 +俁 +崵 +毸 +苪 +掙 +鴡 +萭 +俴 +屜 +蒾 +艹 +剷 +慍 +朮 +枴 +氳 +猓 +甽 +箝 +譁 +贗 +迆 +鈽 +鍊 +鍰 +鏍 +靦 +餽 +丮 +丱 +仜 +仩 +伬 +伔 +仱 +伀 +伻 +佢 +佒 +侀 +侇 +佷 +佌 +佪 +侐 +侜 +俓 +侲 +俉 +侻 +侳 +俇 +倅 +倇 +倰 +倛 +倳 +倷 +俷 +倠 +偯 +偞 +偠 +偋 +偝 +偛 +偢 +偅 +偟 +偩 +偫 +傛 +傔 +傞 +傋 +傌 +傎 +傝 +偨 +傂 +傽 +傿 +僆 +傮 +僄 +僈 +傰 +僁 +傱 +僋 +僗 +僛 +僪 +僝 +僓 +僿 +儃 +儰 +僸 +僶 +僾 +儌 +僽 +儜 +儓 +儗 +儑 +儢 +儤 +儠 +儸 +儹 +儽 +冓 +冘 +冞 +凊 +凅 +凔 +刌 +刉 +刓 +刜 +刞 +刵 +刲 +剆 +刱 +剉 +剚 +剒 +剫 +剭 +剬 +剺 +剸 +剻 +剼 +劀 +劋 +劖 +劘 +劗 +劙 +劦 +勴 +匊 +匢 +匰 +匴 +匷 +匽 +卌 +卼 +厎 +厒 +厗 +厞 +厜 +厤 +厬 +厹 +吰 +吷 +吪 +呿 +咈 +呫 +呺 +呥 +呬 +呴 +茍 +咷 +咮 +咶 +哅 +咠 +咢 +唦 +唗 +唒 +哤 +唚 +唈 +哫 +唅 +唴 +啢 +唶 +啒 +啅 +唌 +唲 +喨 +喥 +喭 +噅 +喓 +喣 +啽 +喌 +嗃 +嗛 +嗋 +嗀 +喿 +喍 +嗏 +嗕 +嗈 +嘕 +嘒 +嗼 +嘐 +嘓 
+嘂 +嗺 +嘝 +嘄 +嗿 +噈 +噊 +噆 +噚 +嘳 +嘽 +嘾 +噮 +噳 +噣 +噭 +噞 +嚌 +嚍 +嚃 +嚘 +嚜 +嚫 +嚪 +嚬 +嚲 +嚵 +嚽 +嚾 +囆 +囅 +囋 +囗 +圁 +圞 +圠 +坁 +坅 +坲 +坱 +垀 +坴 +垗 +垝 +垔 +垘 +垽 +垼 +埢 +埶 +堩 +堣 +塈 +堥 +塓 +塉 +塯 +塕 +塼 +墆 +塿 +塴 +墋 +塺 +墝 +墯 +壈 +墽 +壖 +壝 +壛 +壾 +壿 +夃 +夎 +夒 +夗 +奅 +奊 +奰 +奲 +奼 +妦 +妎 +妢 +妐 +妵 +姏 +姎 +㚷 +姡 +姺 +姼 +娭 +婐 +婟 +婥 +婓 +婗 +媔 +媟 +媢 +婸 +媦 +媥 +媬 +媕 +娷 +嫇 +嫋 +媰 +媻 +嫮 +嫥 +嫢 +嫛 +嫿 +嫴 +嫷 +嫶 +嬎 +嬓 +嬐 +嬲 +嬽 +孈 +屘 +孲 +孷 +宎 +宨 +寪 +寍 +寋 +寑 +寙 +寠 +寱 +尌 +尒 +尟 +尰 +尳 +屖 +屔 +屝 +屧 +屩 +屮 +屴 +岏 +岋 +岉 +岒 +岮 +岤 +岯 +岟 +岝 +峐 +峌 +峞 +峉 +峊 +峬 +峮 +峷 +崝 +崨 +崥 +崏 +崰 +崣 +崷 +嵃 +嵑 +崳 +崺 +嵂 +嵱 +嵣 +嵥 +嵞 +嶀 +嵽 +嶆 +嵺 +嵷 +嶊 +嶉 +嶈 +嵾 +嶕 +嶜 +嶡 +嶚 +嶞 +嶱 +嶩 +嶵 +嶭 +巃 +巏 +巕 +巟 +巹 +帊 +帗 +帟 +帣 +帠 +帤 +帩 +帾 +帴 +幏 +幎 +幓 +幩 +幝 +幠 +幧 +幨 +幦 +幭 +幰 +庂 +庉 +庌 +庈 +庰 +庛 +庣 +庨 +庮 +庪 +庬 +庴 +廅 +廇 +廘 +廗 +廎 +廜 +緳 +廦 +廥 +廮 +廯 +蠯 +廾 +弚 +弝 +弣 +弤 +弮 +弳 +彃 +彉 +彋 +彏 +彯 +彴 +彸 +彾 +徦 +徥 +徯 +徲 +徾 +徿 +忀 +忁 +忔 +忕 +忨 +忣 +忷 +忥 +怭 +怲 +怋 +怴 +怗 +怚 +怞 +怬 +怢 +怐 +怮 +怓 +怷 +怹 +恲 +恞 +恅 +恇 +恉 +恛 +恌 +恀 +恟 +悀 +悁 +悕 +悗 +悇 +悊 +悐 +悾 +悺 +惓 +惤 +惈 +悷 +惉 +悹 +惌 +惢 +惄 +愊 +愖 +愅 +惵 +愓 +惸 +惼 +惾 +慉 +慅 +愶 +愲 +愮 +愯 +愬 +慁 +慞 +慱 +慒 +慓 +慲 +憀 +慴 +慔 +慺 +慛 +憃 +慹 +憱 +憰 +憢 +憉 +憛 +憯 +憟 +憪 +憡 +憝 +憖 +懅 +憴 +懆 +懁 +憿 +憸 +憵 +憼 +懧 +懠 +懥 +懤 +懘 +懭 +懱 +懪 +懰 +懫 +懻 +戁 +戃 +戄 +戉 +戠 +酨 +戺 +扐 +扜 +扤 +扡 +扢 +抆 +抌 +抎 +抏 +扻 +抭 +抴 +拑 +抾 +抪 +抶 +抮 +挍 +挋 +挃 +拫 +拹 +挏 +挌 +拸 +挀 +拲 +捖 +挬 +挶 +揤 +捊 +挼 +挩 +捁 +挴 +捘 +捔 +捥 +掝 +掗 +掫 +掯 +捵 +掜 +捼 +掤 +掔 +掱 +揎 +揥 +揨 +揯 +揊 +揲 +揵 +摡 +揟 +揝 +揜 +揘 +揅 +揱 +搆 +搟 +搕 +搘 +搹 +搷 +搣 +搰 +搊 +搚 +摀 +搧 +搫 +摍 +摝 +摲 +摦 +摎 +摋 +摓 +摐 +摿 +摮 +摰 +撢 +撠 +撗 +撜 +撋 +撊 +撌 +撟 +擗 +擖 +擏 +擉 +撽 +擩 +擣 +擫 +擭 +擨 +擽 +擸 +攇 +攐 +攍 +攌 +攗 +攕 +攓 +攡 +攠 +攦 +攩 +攭 +攲 +攳 +敁 +敊 +敆 +敓 +敧 +敪 +敤 +敜 +敯 +敳 +敶 +敺 +敹 +敿 +斁 +斀 +斄 +斒 +斔 +斞 +斨 +斪 +斻 +旍 +旓 +旚 +旝 +旟 +昲 +昦 +昢 +晇 +晥 +晜 +晼 +晬 +暀 +暆 +暍 +暋 +暡 +暰 +暩 +曀 +曊 +曋 +曏 +曒 +曚 +曣 +曭 +朁 +朅 +朄 +朒 +朘 +朣 +朾 +朹 +朻 +朼 +杅 +杇 +杝 +杗 +枎 +杶 +枆 +枌 +柲 +枺 +枻 +柸 +柀 +柅 +柫 +柤 +柍 +柮 +柣 +柂 +柧 +栚 +桋 +桏 +栱 +栵 +栫 +栭 +栯 +栘 +栔 +梡 +梇 +梐 +桭 +梮 +楖 +梬 +梩 +桵 +梒 +椌 +椄 +棜 +棷 +棳 +棌 +椈 +楰 +棯 +椔 +棸 +楟 +楎 +楱 +楅 +楺 +楈 +楛 +楉 +楬 +椳 +楀 +楄 +楶 +楘 +榶 +槉 +榠 +榬 +榼 +榙 +榩 +榾 +榯 +槄 +榽 +榹 +槥 +槸 +樕 +樠 +槬 +槢 +樛 +樝 +槾 +樧 +槮 +樔 +槷 +橀 +樴 +橉 +橧 +樲 +橨 +橝 +橭 
+橶 +樿 +橁 +檍 +檖 +檁 +檟 +橾 +檛 +檓 +檕 +檃 +櫅 +檹 +櫡 +櫠 +櫌 +櫑 +櫙 +櫋 +櫜 +櫐 +櫫 +櫬 +櫰 +櫹 +櫺 +櫼 +欃 +欋 +欈 +欐 +欑 +欘 +欨 +欴 +欯 +欭 +欱 +欶 +欳 +欷 +欿 +歂 +歈 +歍 +歋 +歕 +歔 +歜 +歠 +歭 +歾 +肂 +殈 +殏 +殔 +殗 +殙 +殠 +殥 +殢 +殦 +殧 +殰 +殶 +毃 +毄 +毈 +毇 +毊 +毚 +毞 +毦 +毤 +毨 +毣 +毰 +毲 +毻 +毼 +毾 +氁 +氀 +氄 +氠 +氶 +汃 +汒 +汏 +汍 +汸 +沋 +汱 +汯 +沕 +汦 +汳 +泬 +沶 +沬 +泧 +沷 +泭 +泲 +泒 +沴 +洟 +洊 +洀 +浺 +浶 +洍 +涒 +浘 +浢 +涊 +涆 +浧 +涗 +涳 +涬 +淢 +涷 +淔 +渀 +淈 +涾 +淊 +涽 +淭 +湆 +湇 +湅 +湢 +渿 +湁 +渜 +渳 +湀 +渻 +渮 +湨 +湡 +渱 +渨 +湠 +湱 +湩 +渹 +溛 +滖 +溓 +溔 +滒 +溰 +溾 +滜 +滵 +滱 +漃 +漥 +漮 +潎 +漙 +漧 +漘 +漒 +滭 +漊 +潳 +滮 +潀 +漰 +潃 +漅 +濆 +澒 +澅 +潚 +潠 +澖 +潶 +潬 +潒 +潐 +潗 +澓 +潝 +濇 +濎 +濈 +濄 +澞 +澨 +瀄 +濌 +澩 +濴 +濔 +濣 +濭 +濧 +濦 +瀇 +瀎 +濿 +瀀 +濻 +瀙 +瀖 +瀫 +瀡 +瀢 +瀩 +瀯 +瀷 +灂 +瀸 +瀿 +瀺 +灄 +灉 +灖 +灗 +灛 +灟 +灨 +灩 +灪 +炾 +炰 +烓 +烑 +缹 +焍 +烰 +焠 +焮 +焣 +煆 +煣 +煝 +熐 +熉 +熀 +熂 +熚 +燅 +燂 +熸 +燀 +燡 +爁 +爊 +爂 +爓 +爞 +爢 +爣 +牄 +牉 +牋 +牏 +牣 +牬 +牰 +牸 +牷 +犈 +犉 +犆 +犅 +犌 +犑 +犐 +犗 +犕 +犓 +犘 +犚 +犝 +犞 +犥 +犦 +犤 +犣 +犩 +犪 +犮 +犵 +犿 +狆 +狖 +狋 +狘 +狜 +狔 +狚 +狌 +狑 +狊 +狤 +狫 +狪 +狣 +猀 +狾 +猑 +猘 +猈 +狿 +猏 +猋 +猒 +猧 +猲 +猭 +猦 +猣 +猵 +猼 +獂 +獀 +獊 +獑 +獌 +獘 +獞 +獟 +獝 +獛 +獡 +獩 +獦 +獥 +獳 +獶 +獽 +獿 +玂 +玁 +玈 +玊 +玔 +珓 +珶 +琖 +瑵 +璊 +瑽 +璅 +瑿 +璗 +瓁 +瓋 +瓝 +瓟 +瓡 +瓥 +瓨 +瓬 +瓵 +瓾 +瓽 +甀 +甃 +甈 +甋 +甐 +甒 +甔 +甖 +甝 +甮 +甿 +畟 +畣 +畽 +疀 +疧 +痁 +疻 +痀 +痎 +痏 +痋 +痌 +痑 +痚 +痡 +痝 +痗 +痯 +瘏 +痷 +痸 +痻 +瘈 +瘑 +瘝 +瘣 +瘯 +瘱 +瘽 +癈 +癉 +癙 +癐 +癓 +癠 +癵 +癹 +皊 +皏 +皫 +皯 +皵 +皻 +皽 +皾 +盄 +盓 +盝 +盬 +盭 +盳 +眃 +眅 +盻 +眝 +眐 +眓 +眒 +眣 +眑 +眕 +眹 +眱 +眲 +眴 +眳 +眽 +睆 +睅 +睊 +睋 +睌 +睕 +睟 +睒 +睖 +睩 +睧 +睔 +瞁 +睼 +瞂 +睮 +睯 +瞏 +瞉 +瞚 +瞝 +瞡 +瞛 +瞲 +瞷 +瞶 +瞴 +矂 +矉 +矊 +矌 +矎 +矏 +矐 +矔 +矕 +矘 +矠 +矱 +矲 +矹 +矺 +砅 +砐 +砏 +砎 +砨 +硈 +硉 +硠 +硥 +硱 +硰 +硩 +碔 +碄 +碅 +碆 +硾 +碫 +碞 +磍 +磌 +磎 +磈 +磃 +磝 +磩 +磥 +磞 +磛 +磳 +磼 +磿 +礔 +礉 +礝 +礛 +礜 +礥 +礣 +礧 +礨 +礭 +礿 +祌 +祅 +祔 +祒 +祑 +祤 +祩 +祪 +祣 +祫 +祡 +祴 +祳 +禂 +禗 +禜 +禫 +禭 +禬 +禴 +禷 +禸 +歶 +秅 +秏 +秖 +秎 +秮 +秪 +秺 +秶 +稊 +稒 +稫 +穊 +稰 +稯 +穋 +穛 +穖 +穧 +穨 +穮 +穬 +穭 +穱 +穾 +窆 +窉 +窌 +窏 +窔 +窐 +窙 +窢 +窞 +窫 +窲 +窴 +窱 +窾 +竀 +竁 +竷 +笐 +笓 +笅 +笵 +笻 +笴 +笰 +笢 +笝 +笲 +筄 +筡 +箈 +箊 +箌 +箛 +箎 +箘 +箄 +箷 +箾 +篎 +箯 +箹 +篞 +篣 +篧 +篕 +篨 +篹 +簅 +篲 +篿 +篻 +簎 +篴 +簂 +簁 +篸 +篽 +簜 +簩 +簙 +簭 +簦 +簨 +簢 +簥 +簳 +簼 +簬 +簻 +籉 +籈 +籊 +籔 +籗 +籧 +籦 +籯 +籺 +籸 +籹 +粊 +粔 +粻 +糔 +糪 +糱 +糷 +紎 +紟 +紒 +紽 
+紸 +紶 +紩 +絇 +紾 +絘 +絯 +絓 +絧 +絏 +絭 +絫 +綀 +綍 +絿 +綅 +絻 +絼 +綔 +綷 +緂 +綪 +緀 +緅 +緎 +緆 +緌 +綯 +綼 +緷 +緛 +緪 +緧 +縃 +緺 +緶 +緰 +縗 +縌 +縓 +縎 +縜 +縚 +縏 +縼 +繂 +縳 +顈 +繈 +縸 +縪 +繉 +繀 +縩 +緵 +縰 +縿 +縶 +繜 +繐 +繣 +繘 +繢 +繟 +繑 +繠 +繶 +繵 +繸 +繷 +繺 +繲 +繴 +纀 +纇 +纋 +纆 +纑 +纗 +纚 +缿 +罊 +罏 +罜 +罞 +罝 +罛 +罣 +罥 +罦 +罭 +罫 +罬 +罻 +罼 +罺 +罿 +羃 +羉 +羍 +羒 +羜 +羛 +羢 +羠 +羦 +羬 +羭 +羵 +羳 +羷 +羺 +羾 +翋 +翍 +翐 +翑 +翇 +翢 +翣 +翭 +翪 +翨 +翴 +翲 +翽 +翿 +耟 +耞 +耡 +耴 +耾 +耹 +聇 +聈 +聑 +聏 +聝 +肕 +肙 +肒 +肣 +肵 +胘 +胑 +胐 +胕 +胉 +胏 +胹 +胵 +脁 +胻 +脀 +胾 +胔 +脰 +脥 +脤 +脙 +脡 +脕 +脧 +腃 +腏 +腄 +腇 +脽 +腍 +腤 +腷 +腜 +腛 +腢 +腲 +朡 +腞 +腶 +膉 +膆 +膃 +膇 +膍 +膌 +膋 +膟 +膕 +膢 +膱 +膹 +膫 +膰 +膬 +膴 +膲 +臇 +膷 +臄 +臅 +臒 +臐 +臗 +臛 +臡 +臦 +臩 +臮 +臲 +臷 +臸 +臿 +舋 +舑 +舕 +舝 +舡 +舼 +舽 +艀 +艂 +艓 +艒 +艐 +艑 +艕 +艛 +艵 +艼 +芀 +芐 +芅 +芓 +芔 +苀 +芚 +芵 +芧 +芞 +芺 +苙 +苨 +苖 +苬 +苲 +苵 +苶 +茙 +茥 +茿 +茦 +茢 +荂 +茪 +荍 +茖 +茤 +茠 +茩 +茻 +莐 +莣 +莍 +荺 +莤 +荴 +莏 +莁 +荵 +莔 +莃 +莌 +莋 +荾 +莥 +菨 +萒 +菧 +菤 +菆 +菣 +菿 +菋 +菎 +菵 +萉 +菞 +菳 +菕 +蓱 +萿 +葹 +葥 +葀 +葧 +萰 +葍 +葽 +蔇 +葞 +萷 +萺 +萴 +葅 +菙 +葋 +萯 +葂 +葟 +葌 +蓎 +蒬 +蒮 +蒫 +蒪 +蒚 +蒝 +蓌 +蒛 +蒩 +蒘 +蒶 +蒠 +蔤 +蔏 +蔩 +蔉 +蔍 +蔧 +蔜 +蓻 +蓺 +蓴 +蔪 +蓲 +蓷 +蓫 +蔒 +蓩 +蔖 +蓾 +蔨 +蔮 +蔂 +蓶 +蔱 +蓹 +蔠 +蔰 +蕫 +蕍 +蕀 +蕆 +蕄 +蕇 +蕣 +蕛 +蕱 +蕵 +蕮 +蕧 +蕠 +蕦 +蕝 +薃 +薧 +薕 +薠 +薋 +薣 +薚 +蕼 +薉 +蕸 +薎 +薖 +薍 +薝 +薂 +藆 +藀 +藃 +藂 +薵 +薽 +藇 +藄 +藋 +藈 +藅 +薱 +薶 +藒 +藫 +藱 +藙 +藡 +藚 +藗 +藲 +藬 +藘 +藣 +藑 +藰 +蘁 +藾 +蘛 +蘉 +蘌 +蘪 +蘦 +蘟 +蘣 +蘜 +蘙 +蘮 +蘡 +蘠 +蘥 +蘴 +蘳 +蘬 +虀 +蘹 +蘱 +蘻 +蘾 +虃 +虆 +虇 +虈 +虌 +虋 +虙 +虡 +虣 +虩 +虪 +虰 +虭 +虴 +蚑 +蚞 +蚇 +蚗 +蚚 +蚅 +蚥 +蚙 +蚿 +蚷 +蛂 +蛁 +蛅 +蛈 +蚹 +蚳 +蚸 +蛌 +蚻 +蛢 +蛦 +蛓 +蛣 +蛚 +蛪 +蛝 +蛫 +蛜 +蛬 +蛗 +蜄 +蛷 +蜌 +蛖 +蛵 +蜁 +蛶 +蜳 +蝫 +蜙 +蝃 +蜬 +蝁 +蝆 +蜠 +蜲 +蜪 +蜭 +蜼 +蜵 +蝂 +蜦 +蜧 +蜸 +蜤 +蜰 +蝖 +蝷 +蟡 +蝳 +蝔 +蝛 +蝒 +蝑 +蝞 +蝭 +蝪 +蝐 +蝝 +蝬 +蝺 +蝜 +螛 +螏 +螓 +螒 +螁 +螖 +螘 +蝹 +螇 +螑 +螝 +螜 +螚 +螪 +螰 +螹 +螼 +螮 +蟉 +蟃 +蟂 +螷 +螴 +螿 +螸 +蟞 +蟧 +蟦 +蟢 +蟟 +蟤 +蟔 +蟓 +蟭 +蟘 +螤 +蟗 +蟙 +蠁 +蟨 +蠀 +蟺 +蠉 +蠌 +蟼 +蠈 +蟿 +蠗 +蠩 +蠝 +蠛 +蠠 +蠤 +蠜 +蠫 +蠬 +蠨 +蠦 +蠪 +蠥 +蠰 +蠮 +蠳 +蠸 +蠾 +蠽 +蠿 +衁 +衈 +衋 +衧 +衪 +衭 +衶 +袀 +衱 +衯 +袃 +袉 +袕 +袨 +袚 +袑 +袡 +袘 +袧 +袬 +袌 +袺 +裗 +袹 +袸 +裀 +袶 +袽 +袲 +裋 +裍 +裞 +裚 +裷 +裧 +裺 +裮 +裶 +裯 +裻 +褁 +褅 +褋 +褗 +褆 +褖 +褑 +褦 +褮 +褱 +褢 +褩 +褵 +褼 +褾 +襒 +褷 +襂 +褽 +襓 +襋 +襆 +襐 +襛 +襗 +襡 +襘 +襝 +襣 +襭 +襩 +襮 +襳 +襹 +襺 +覂 +覅 +覕 +覛 +覝 +覢 +覤 +覣 +覭 +覮 +覶 
+觓 +觤 +觡 +觠 +觢 +觩 +觰 +觬 +觲 +觷 +觺 +觻 +觼 +觾 +訑 +訰 +訧 +訬 +訞 +詍 +訹 +詙 +詀 +詄 +詅 +訿 +誂 +詻 +誃 +誫 +誙 +誋 +諆 +誸 +諔 +諕 +誻 +諀 +諅 +諵 +諝 +諰 +諈 +謞 +謘 +謑 +謋 +謒 +謕 +謍 +謈 +謪 +謧 +謣 +謰 +謵 +譇 +謯 +謱 +謥 +謷 +謦 +譐 +譈 +譊 +譀 +譋 +譕 +譑 +譠 +譪 +譝 +譨 +譣 +譥 +譹 +譸 +譅 +譺 +譻 +譾 +讄 +讂 +讆 +讋 +讔 +讘 +讟 +谹 +谻 +谽 +谾 +豃 +豋 +豍 +豏 +豗 +豜 +豝 +豟 +豥 +豤 +豦 +豭 +豰 +豲 +豱 +豯 +豵 +豷 +豶 +豻 +豽 +貁 +貀 +貄 +貏 +貑 +貕 +貙 +貗 +貜 +貣 +貾 +賌 +賥 +賟 +賙 +賵 +賮 +贆 +贕 +贙 +赨 +赩 +赮 +赸 +趀 +趌 +趎 +趏 +趍 +趓 +趠 +趜 +趡 +趥 +趧 +趬 +趪 +趭 +趫 +趮 +趷 +趹 +跘 +跓 +跍 +跇 +跜 +跕 +跙 +跈 +跰 +跠 +跮 +跦 +跢 +跧 +跲 +跫 +踂 +跿 +踍 +踃 +踇 +踆 +跾 +踠 +踥 +踤 +踡 +踕 +踛 +踖 +踑 +踙 +踧 +踘 +踓 +踳 +踾 +踸 +踼 +蹎 +蹍 +蹓 +蹗 +蹖 +蹞 +蹥 +蹛 +蹡 +蹝 +蹔 +蹸 +蹳 +蹪 +躆 +躈 +躖 +躗 +躟 +躠 +躤 +躣 +躩 +躨 +躽 +軓 +軘 +軞 +軯 +軷 +軦 +軮 +軥 +軵 +軧 +軨 +軶 +軱 +軬 +輆 +軿 +輁 +輀 +輂 +輐 +輑 +輤 +輘 +輚 +輠 +輣 +輖 +輗 +輮 +輵 +輲 +輹 +輷 +輴 +轃 +轇 +轈 +轒 +轑 +轏 +轐 +轓 +轙 +轖 +轗 +轕 +轚 +轞 +轛 +轠 +辴 +迉 +迒 +迋 +迍 +迖 +迣 +迡 +迾 +迿 +逜 +逿 +遝 +遳 +遰 +遻 +邆 +邅 +遾 +邍 +邔 +邟 +邥 +邞 +邧 +郱 +郕 +郖 +郠 +郙 +郣 +郥 +郘 +郰 +郲 +郔 +鄬 +郼 +鄈 +郹 +郻 +鄁 +鄇 +郺 +鄐 +鄍 +鄏 +鄎 +鄟 +鄝 +鄡 +鄛 +鄨 +鄪 +鄦 +鄮 +鄵 +鄸 +鄻 +鄾 +酀 +酁 +酄 +酇 +酖 +酘 +酓 +酟 +酳 +醆 +醊 +醓 +醙 +醟 +醥 +醧 +醰 +醱 +醷 +醲 +醳 +醹 +醽 +釂 +釃 +釢 +釱 +釳 +釸 +鈚 +鈌 +鈒 +釽 +鈆 +鉒 +鉠 +鉯 +鈶 +鉼 +銤 +銛 +銔 +鉹 +銗 +鋄 +鋀 +鋟 +鋘 +鋩 +鋝 +鋂 +鋊 +錧 +錼 +錭 +錎 +鋋 +鎡 +鎃 +鎯 +鍖 +鍜 +鍐 +鍭 +鍌 +鎒 +鎷 +鎝 +鎉 +鎎 +鎞 +鏏 +鏂 +鏚 +鏬 +鏙 +鐋 +鐏 +鏾 +鐕 +鐨 +鐍 +鐀 +鐎 +鐖 +鐻 +鐶 +鑐 +鑋 +鑕 +鑮 +鑯 +钂 +钀 +钁 +钃 +镺 +镻 +镼 +镽 +閈 +閍 +閺 +閵 +闀 +闉 +闅 +閷 +闒 +闑 +闚 +闛 +闠 +闟 +闤 +阞 +阢 +阤 +阠 +阰 +阹 +阸 +阺 +陏 +陓 +陊 +陼 +陭 +陫 +隇 +陾 +隉 +隒 +隓 +隞 +隤 +隿 +雂 +雈 +雓 +雔 +雗 +雚 +雟 +雘 +雺 +雽 +雿 +霂 +霋 +霒 +霐 +霠 +霣 +霢 +霩 +霫 +霬 +霮 +霵 +霿 +靆 +靃 +靪 +靮 +靷 +靲 +靾 +鞃 +鞀 +鞂 +靻 +鞊 +鞎 +鞈 +鞙 +鞗 +鞚 +鞜 +鞤 +鞪 +鞷 +鞶 +鞹 +鞻 +鞿 +韄 +韅 +韇 +韎 +韐 +韏 +韕 +韔 +韗 +韝 +韟 +韣 +韥 +韰 +韱 +韹 +韽 +頄 +頖 +頞 +頝 +頩 +頨 +頯 +頲 +顁 +顄 +顊 +顉 +顅 +顐 +顑 +顜 +顝 +顠 +顣 +顟 +顤 +顪 +顩 +顲 +颬 +颲 +颸 +颽 +颻 +颾 +飁 +飂 +飉 +飋 +飌 +飣 +飶 +餂 +餀 +飺 +餔 +餖 +餕 +餤 +餟 +餥 +餫 +餪 +餲 +餯 +餭 +餱 +餰 +饁 +饇 +饐 +饎 +饙 +饘 +饛 +饡 +馣 +馲 +馰 +馵 +馻 +馺 +駂 +馽 +駜 +駍 +駏 +駎 +駖 +駮 +駬 +駥 +駤 +駣 +駩 +駺 +駴 +駷 +駹 +駶 +駻 +駽 +駾 +騃 +騉 +騑 +騊 +騇 +騚 +騕 +騥 +騝 +騛 +騢 +騠 +騧 +騞 +騜 +騵 +騲 +騴 +騱 +騬 +騪 +騩 +騹 +騽 +驆 +騺 +驓 +驔 +驈 +驉 +驖 +驞 +驠 +驦 +驨 +骭 +骫 +骹 +骿 +骴 +骾 +髇 +髊 +髆 +髍 +髐 
+髟 +髧 +髬 +髳 +髶 +髺 +髾 +鬁 +髼 +鬋 +鬊 +鬎 +鬌 +鬐 +鬕 +鬗 +鬖 +鬙 +鬞 +鬠 +鬤 +鬫 +鬳 +鬵 +鬺 +鬾 +鬿 +魊 +魌 +魖 +魠 +魡 +魧 +魱 +魦 +魶 +魵 +鮅 +鮇 +魼 +魾 +魻 +鮂 +鮚 +鮞 +鮛 +鮦 +鮥 +鮤 +鮆 +鯆 +鮿 +鮵 +鯈 +鯫 +鯠 +鯞 +鯦 +鯬 +鰌 +鰋 +鰅 +鯸 +鰫 +鰝 +鰬 +鱆 +鰿 +鱄 +鱁 +鰴 +鱐 +鱍 +鱋 +鱕 +鱦 +鱢 +鱞 +鱴 +鱳 +鱹 +鳦 +鳪 +鳭 +鳱 +鳵 +鳼 +鳺 +鳿 +鳷 +鴀 +鳹 +鳻 +鴅 +鴃 +鴥 +鴠 +鴔 +鴩 +鴘 +鴢 +鴐 +鴳 +鵁 +鵧 +鴶 +鴮 +鴱 +鴸 +鵅 +鵃 +鴾 +鵀 +鴽 +鵏 +鵊 +鵛 +鵋 +鵖 +鵌 +鵗 +鵔 +鵷 +鶁 +鶊 +鶄 +鶈 +鵱 +鶀 +鵸 +鶋 +鶌 +鵽 +鵫 +鵴 +鵩 +鶅 +鵳 +鵻 +鶂 +鵹 +鶟 +鶙 +鶤 +鶝 +鶐 +鶛 +鶠 +鶔 +鶜 +鶪 +鶗 +鶢 +鶨 +鶞 +鶣 +鶖 +鶷 +鶶 +鷁 +鷇 +鷊 +鷏 +鶾 +鷅 +鷃 +鶵 +鷈 +鶱 +鶭 +鷛 +鷒 +鷞 +鷋 +鷐 +鷜 +鷑 +鷩 +鷘 +鷖 +鷵 +鷕 +鷻 +鷷 +鷣 +鷤 +鷶 +鷡 +鷮 +鷢 +鸂 +鷾 +鸇 +鸃 +鸆 +鸅 +鸀 +鸁 +鸉 +鷿 +鷽 +鸄 +鸋 +鸍 +鸏 +鸒 +鸔 +鸓 +鸗 +鸙 +鹺 +麃 +麆 +麉 +麎 +麌 +麔 +麙 +麛 +麚 +麜 +麠 +麡 +麧 +麮 +麰 +麶 +麷 +黀 +黂 +黈 +黓 +黕 +黖 +黚 +黤 +黫 +黮 +黭 +黰 +黳 +黵 +黺 +鼁 +鼀 +鼆 +鼊 +鼏 +鼖 +鼛 +鼘 +鼜 +鼤 +鼣 +鼥 +鼪 +鼨 +鼭 +鼰 +鼮 +鼵 +鼳 +鼲 +鼸 +鼶 +齀 +齂 +齃 +齌 +齍 +齎 +齖 +齗 +齘 +齛 +齠 +齞 +齝 +齥 +齤 +齫 +齱 +齰 +齮 +齯 +齴 +齵 +齸 +齻 +齺 +齹 +齾 +龒 +龤 +堔 +礂 +蒏 +蒆 +兙 +兛 +兞 +兝 +兡 +兣 +嗧 +瓩 +忼 +擡 +氊 +穇 +擧 +譌 +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +© +° +² +´ +½ +Á +Ä +Å +Ç +È +É +Í +Ó +Ö +× +Ü +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +í +ð +ñ +ò +ó +ô +õ +ö +ø +ú +û +ü +ý +ā +ă +ą +ć +Č +č +đ +ē +ė +ę +ğ +ī +ı +Ł +ł +ń +ň +ō +ř +Ş +ş +Š +š +ţ +ū +ż +Ž +ž +Ș +ș +ț +Δ +α +λ +μ +φ +Г +О +а +в +л +о +р +с +т +я +ồ +— +― +’ +“ +” +… +℃ +→ +∇ +− +■ +☆ +、 +。 +々 +〆 +〈 +〉 +「 +」 +『 +』 +〔 +〕 +〜 +! +# +% +& +( +) ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +R +S +T +U +V +W +X +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +~ +・ +ǎ +ǒ +ě +ǐ +ì +ǔ +ù +ǖ +ǘ +ǚ +ǜ +【 +】 +《 +》 +‥ +{ +} +\ +| +@ +^ +~ +÷ +∕ +∙ +⋅ +· +⊕ +⊖ +⊗ +⊘ +⊙ +± +∓ +∩ +∪ +□ +⊎ +⊓ +⊔ +≠ +≈ +≡ +≤ +≥ +≪ +≫ +≲ +≳ +≶ +≷ +≺ +≻ +≼ +≽ +∈ +∉ +⊂ +⊃ +⊆ +⊇ +⊄ +⊅ +∅ +∖ +∁ +∆ +∧ +∨ +¬ +⊻ +⊼ +⊽ +← +↔ +⇒ +⇐ +⇔ +∀ +∃ +∄ +∴ +∵ +∝ +∞ +⊥ +∟ +∠ +∡ +∢ +′ +″ +∥ +⊾ +⊿ +∂ +∫ +∬ +∭ +∮ +∯ +∰ +∑ +∏ +√ +∛ +∜ +∱ +∲ +∳ +∶ +∷ +∼ +® +≄ +≅ +≃ +≦ +≧ +⊈ +⊉ +⊢ +⊤ +⊨ +⊧ +℉ +Ω +℧ +Å +⌀ +ℏ +⅀ +⍺ +⍵ +¢ +€ +£ +¥ +¥ +₿ +↑ +↓ +↕ +↖ +↗ +↘ +↙ +↺ +↻ +↼ +↽ +↾ +↿ +⇀ +⇁ +⇂ +⇃ +⇋ +⇌ +ª +º +⁰ +¹ +³ +⁴ +⁵ +⁶ +⁷ +⁸ +⁹ +⁺ +⁻ +⁼ +⁽ +⁾ +ⁿ +₀ +₁ +₂ +₃ +₄ +₅ +₆ +₇ +₈ +₉ +₊ +₋ +₌ +₍ +₎ +Ⅰ +Ⅱ +Ⅲ +Ⅳ +Ⅴ +Ⅵ +Ⅶ +Ⅷ +Ⅸ +Ⅹ +Ⅺ +Ⅻ +ⅰ +ⅱ +ⅲ +ⅳ +ⅴ +ⅵ +ⅶ +ⅷ +ⅸ +ⅹ +ⅺ +ⅻ +☰ +☱ +☲ +☳ +☴ +☵ +☶ +☷ +♀ +♂ +♳ +♴ +♵ +♶ +♷ +♸ +♹ +♺ +♩ +♪ +♫ +♬ +⚪ +⚫ +⚬ +✶ +✷ +✸ +➀ +➁ +➂ +➃ +➄ +➅ +➆ +➇ +➈ +➉ +➊ +➋ +➌ +➍ +➎ +➏ +➐ +➑ +➒ +➓ +⏀ +⏁ +⏂ +⏃ +⏄ +⏅ +⏆ +⏇ +⏈ +⏉ +⏊ +⏋ +⏌ +⏚ +⏴ +⏵ +⏶ +⏷ +⏸ +⏹ +⏺ +⏻ +⏼ +Α +Β +Γ +Ε +Ζ +Η +Θ +Ι +Κ +Λ +Μ +Ν +Ξ +Ο +Π +Ρ +Σ +Τ +Υ +Φ +Χ +Ψ +β +γ +δ +ε +ζ +η +θ +ι +κ +ν +ξ +ο +π +ρ +σ +τ +υ +χ +ψ +ω +ϐ +ϑ +ϒ +ϕ +█ +ϖ +ϰ +ϱ +ϴ +ϵ +ϝ +Ϟ +ϟ +Ϡ +ϡ +Ϣ +ϣ +Ϥ +ϥ +Ϧ +ϧ +Ϩ +ϩ +Ϫ +ϫ +Ϭ +ϭ +Ϯ +ϯ +∸ +∹ +∺ +∻ +∽ +∾ +∿ +≀ +≁ +≂ +≆ +≇ +≉ +≊ +≋ +≌ +≍ +≎ +≏ +≐ +≑ +≒ +≓ +≔ +≕ +≖ +≗ +≘ +≙ +≚ +≛ +≜ +≝ +≞ +≟ +≢ +≣ +≨ +≩ +≬ +≭ +≮ +≯ +≰ +≱ +≴ +≵ +≸ +≹ +≾ +≿ +⊀ +⊁ +⊊ +⊋ +⊌ +⊍ +⊏ +⊐ +⊑ +⊒ +⊚ +⊛ +⊜ +⊝ +⊞ +⊟ +⊠ +⊡ +⊣ +⊦ +⊩ +⊪ +⊫ +⊬ +⊭ +⊮ +⊯ +⊰ +⊱ +⊲ +⊳ +⊴ +⊵ +⊶ +⊷ +⊸ +⊹ +⊺ +ℎ +℘ +ℜ +ℑ +ℵ +ℶ +ℷ +ℸ +⌬ +⌭ +⌮ +⌯ +⎔ +¤ +₠ +₡ +₢ +₣ +₤ +₥ +₦ +₧ +₨ +₩ +₪ +₫ +₭ +₮ +₯ +₰ +₱ +₲ +₳ +₴ +₵ +₶ +₷ +₸ +₹ +₺ +₻ +₼ +₽ +₾ +↚ +↛ +↜ +↝ +↞ +↟ +↠ +↡ +↢ +↣ +↤ +↥ +↦ +↧ +↨ +↩ +↪ +↫ +↬ +↭ +↮ +↯ +↰ +↱ +↲ +↳ +↴ +↵ +↶ +↷ +↸ +↹ +⇄ +⇅ +⇆ +⇇ +⇈ +⇉ +⇊ +⇍ +⇎ +⇏ +⇑ +⇓ +⇕ +⇖ +⇗ +⇘ +⇙ +⇚ +⇛ +⇜ +⇝ +⇞ +⇟ +⇠ +⇡ +⇢ +⇣ +⇤ +⇥ +⇦ +⇧ +⇨ +⇩ +⇪ +⇫ +⇬ +⇭ +⇮ +⇯ +⇰ +⇱ +⇲ +⇳ +⇴ +⇵ +⇶ +⇷ +⇸ +⇹ +⇺ +⇻ +⇼ +⇽ +⇾ +⇿ +ↀ +ↁ +ↂ +☀ +☁ +☂ +☃ +☄ +★ +☇ +☈ +☉ +☊ +☋ +☌ +☍ +☎ +☏ +☐ +☑ +☒ +☓ +☔ +☕ +☖ +☗ +☘ +☙ +☚ +☛ +☜ +☝ +☞ +☟ +☠ 
+☡ +☢ +☣ +☤ +☥ +☦ +☧ +☨ +☩ +☪ +☫ +☬ +☭ +☮ +☯ +☸ +☹ +☺ +☻ +☼ +☽ +☾ +☿ +♁ +♃ +♄ +♅ +♆ +♇ +♔ +♕ +♖ +♗ +♘ +♙ +♚ +♛ +♜ +♝ +♞ +♟ +♠ +♡ +♢ +♣ +♤ +♥ +♦ +♧ +♨ +♭ +♮ +♯ +♰ +♱ +♲ +♻ +♼ +♽ +♾ +⚀ +⚁ +⚂ +⚃ +⚄ +⚅ +⚆ +⚇ +⚈ +⚉ +⚊ +⚋ +⚌ +⚍ +⚎ +⚏ +⚐ +⚑ +⚒ +⚓ +⚔ +⚕ +⚖ +⚗ +⚘ +⚙ +⚚ +⚛ +⚜ +⚝ +⚞ +⚟ +⚠ +⚡ +⚢ +⚣ +⚤ +⚥ +⚦ +⚧ +⚨ +⚩ +⚭ +⚮ +⚯ +⚰ +⚱ +⚲ +⚳ +⚴ +⚵ +⚶ +⚷ +⚸ +⚹ +⚺ +⚻ +⚼ +⚿ +⛀ +⛁ +⛂ +⛃ +⛆ +⛇ +⛈ +⛉ +⛊ +⛋ +⛌ +⛍ +⛏ +⛐ +⛑ +⛒ +⛓ +⛕ +⛖ +⛗ +⛘ +⛙ +⛚ +⛛ +⛜ +⛝ +⛞ +⛠ +⛡ +⛢ +⛣ +⛤ +⛥ +⛦ +⛧ +⛨ +⛩ +⛪ +⛫ +⛬ +⛭ +⛮ +⛯ +⛶ +⛾ +⛿ +✆ +✇ +✈ +✉ +✌ +✍ +✎ +✏ +✐ +✑ +✒ +✓ +✔ +✕ +✙ +✚ +✛ +✜ +✝ +✞ +✟ +✠ +✡ +✢ +✣ +✤ +✥ +✦ +✧ +✩ +✪ +✫ +✬ +✭ +✮ +✯ +✰ +✱ +✲ +✳ +✴ +✵ +✹ +✺ +✻ +✼ +✽ +✾ +✿ +❀ +❁ +❂ +❃ +❄ +❅ +❆ +❇ +❈ +❉ +❊ +❋ +❍ +❏ +❐ +❑ +❒ +❖ +❘ +❙ +❚ +❛ +❜ +❝ +❞ +❡ +❢ +❣ +❤ +❥ +❦ +❧ +❨ +❩ +❪ +❫ +❬ +❭ +❮ +❯ +❰ +❱ +❲ +❳ +❴ +❵ +❶ +❷ +❸ +❹ +❺ +❻ +❼ +❽ +❾ +❿ +① +② +③ +④ +⑤ +⑥ +⑦ +⑧ +⑨ +⑩ +➔ +➕ +➖ +➗ +➘ +➙ +➚ +➛ +➜ +➝ +➞ +➟ +➠ +➡ +➢ +➣ +➤ +➥ +➦ +➧ +➨ +➩ +➪ +➫ +➬ +➭ +➮ +➯ +➰ +➱ +➲ +➳ +➴ +➵ +➶ +➷ +➸ +➹ +➺ +➻ +➼ +➽ +➾ +➿ +⌘ +⌥ +⌃ +⎋ +⌫ +⌦ +⏏ +⌤ +⌧ +⌨ +⎆ +⎇ +⎈ +⎉ +⎊ +⎌ +⎍ +⎎ +⎏ +⎐ +⎑ +⎒ +⎓ +⎕ +⎖ +⎗ +⎘ +⎙ +⎚ +⎛ +⎜ +⎝ +⎞ +⎟ +⎠ +⎡ +⎢ +⎣ +⎤ +⎥ +⎦ +⎧ +⎨ +⎩ +⎪ +⎫ +⎬ +⎭ +⎮ +⎯ +⎰ +⎱ +⎲ +⎳ +⎴ +⎵ +⎶ +⎷ +⎸ +⎹ +⎺ +⎻ +⎼ +⎽ +⎾ +⎿ +⏍ +⏎ +⏐ +⏑ +⏒ +⏓ +⏔ +⏕ +⏖ +⏗ +⏘ +⏙ +⏛ +⏜ +⏝ +⏞ +⏟ +⏠ +⏡ +⏢ +⏣ +⏤ +⏥ +⏦ +⏧ +⏨ +⏭ +⏮ +⏯ +⏱ +⏲ +▲ +▽ +◐ +⏽ +⏾ +⏿ +ɐ +ɑ +ɒ +ɓ +ɔ +ɕ +ɖ +ɗ +ɘ +ə +ɚ +ɛ +ɜ +ɝ +ɞ +ɟ +ɠ +ɡ +ɢ +ɣ +ɤ +ɥ +ɦ +ɧ +ɨ +ɩ +ɪ +ɫ +ɬ +ɭ +ɮ +ɯ +ɰ +ɱ +ɲ +ɳ +ɴ +ɵ +ɶ +ɷ +ɸ +ɹ +ɺ +ɻ +ɼ +ɽ +ɾ +ɿ +ʀ +ʁ +ʂ +ʃ +ʄ +ʅ +ʆ +ʇ +ʈ +ʉ +ʊ +ʋ +ʌ +ʍ +ʎ +ʏ +ʐ +ʑ +ʒ +ʓ +ʔ +ʕ +ʖ +ʗ +ʘ +ʙ +ʚ +ʛ +ʜ +ʝ +ʞ +ʟ +ʠ +ʡ +ʢ +ʣ +ʤ +ʥ +ʦ +ʧ +ʨ +ʩ +ʪ +ʫ +ʬ +ʭ +ʮ +ʯ +━ +Ǝ +à +● +▶ +| +𝑢 +〖 +〗 +︽ +– +﹥ +𝜓 +• +∋ +ƒ +० +✘ +Е +◉ +〒 +𝒱 +𝜆 +⟹ +﹪ +◊ +╆ +오 +˂ +〉 +𝝎 +▪ +△ +▁ +◼ +〇 +▷ +▬ +𝒮 +† +ₒ +⼁ +〵 +⭐ +╳ +⟶ +으 +⬆ +Ạ +◀ + +▫ +丄 +︾ +◥ +‖ +𝜌 +ⅼ +▼ +⁎ +﹏ +😁 +😂 +😃 +😄 +😅 +😆 +😉 +😊 +😋 +😌 +😍 +😏 +😒 +😓 +😔 +😖 +😘 +😚 +😜 +😝 +😞 +😠 +😡 +😢 +😣 +😤 +😥 +😨 +😩 +😪 +😫 +😭 +😰 +😱 +😲 +😳 +😵 +😷 +😸 +😹 +😺 +😻 +😼 +😽 +😾 +😿 +🙀 +🙅 +🙆 +🙇 +🙈 +🙉 +🙊 +🙋 +🙌 +🙍 +🙎 +🙏 +✂ +✅ +✊ +✋ +✖ +✨ +❌ +❎ +❓ +❔ +❕ +❗ +🚀 +🚃 +🚄 +🚅 +🚇 +🚉 +🚌 +🚏 +🚑 +🚒 +🚓 +🚕 
+🚗 +🚙 +🚚 +🚢 +🚤 +🚥 +🚧 +🚨 +🚩 +🚪 +🚫 +🚬 +🚭 +🚲 +🚶 +🚹 +🚺 +🚻 +🚼 +🚽 +🚾 +🛀 +Ⓜ +🅰 +🅱 +🅾 +🅿 +🆎 +🆑 +🆒 +🆓 +🆔 +🆕 +🆖 +🆗 +🆘 +🆙 +🆚 +🇩🇪 +🇬🇧 +🇨🇳 +🇯🇵 +🇫🇷 +🇰🇷 +🇪🇸 +🇮🇹 +🇷🇺 +🇺🇸 +🈁 +ℹ +⌚ +⌛ +⏩ +⏪ +⏫ +⏬ +⏰ +⏳ +◻ +◽ +◾ +♈ +♉ +♊ +♋ +♌ +♍ +♎ +♏ +♐ +♑ +♒ +♓ +♿ +⚽ +⚾ +⛄ +⛅ +⛎ +⛔ +⛲ +⛳ +⛵ +⛺ +⛽ +⤴ +⤵ +⬅ +⬇ +⬛ +⬜ +⭕ +〰 +〽 +㊗ +㊙ +🀄 +🃏 +🌀 +🌁 +🌂 +🌃 +🌄 +🌅 +🌆 +🌇 +🌈 +🌉 +🌊 +🌋 +🌌 +🌏 +🌑 +🌓 +🌔 +🌕 +🌙 +🌛 +🌟 +🌠 +🌰 +🌱 +🌴 +🌵 +🌷 +🌸 +🌹 +🌺 +🌻 +🌼 +🌽 +🌾 +🌿 +🍀 +🍁 +🍂 +🍃 +🍄 +🍅 +🍆 +🍇 +🍈 +🍉 +🍊 +🍌 +🍍 +🍎 +🍏 +🍑 +🍒 +🍓 +🍔 +🍕 +🍖 +🍗 +🍘 +🍙 +🍚 +🍛 +🍜 +🍝 +🍞 +🍟 +🍠 +🍡 +🍢 +🍣 +🍤 +🍥 +🍦 +🍧 +🍨 +🍩 +🍪 +🍫 +🍬 +🍭 +🍮 +🍯 +🍰 +🍱 +🍲 +🍳 +🍴 +🍵 +🍶 +🍷 +🍸 +🍹 +🍺 +🍻 +🎀 +🎁 +🎂 +🎃 +🎄 +🎅 +🎆 +🎇 +🎈 +🎉 +🎊 +🎋 +🎌 +🎍 +🎎 +🎏 +🎐 +🎑 +🎒 +🎓 +🎠 +🎡 +🎢 +🎣 +🎤 +🎥 +🎦 +🎧 +🎨 +🎩 +🎪 +🎫 +🎬 +🎭 +🎮 +🎯 +🎰 +🎱 +🎲 +🎳 +🎴 +🎵 +🎶 +🎷 +🎸 +🎹 +🎺 +🎻 +🎼 +🎽 +🎾 +🎿 +🏀 +🏁 +🏂 +🏃 +🏄 +🏆 +🏈 +🏊 +🏠 +🏡 +🏢 +🏣 +🏥 +🏦 +🏧 +🏨 +🏩 +🏪 +🏫 +🏬 +🏭 +🏮 +🏯 +🏰 +🐌 +🐍 +🐎 +🐑 +🐒 +🐔 +🐗 +🐘 +🐙 +🐚 +🐛 +🐜 +🐝 +🐞 +🐟 +🐠 +🐡 +🐢 +🐣 +🐤 +🐥 +🐦 +🐧 +🐨 +🐩 +🐫 +🐬 +🐭 +🐮 +🐯 +🐰 +🐱 +🐲 +🐳 +🐴 +🐵 +🐶 +🐷 +🐸 +🐹 +🐺 +🐻 +🐼 +🐽 +🐾 +👀 +👂 +👃 +👄 +👅 +👆 +👇 +👈 +👉 +👊 +👋 +👌 +👍 +👎 +👏 +👐 +👑 +👒 +👓 +👔 +👕 +👖 +👗 +👘 +👙 +👚 +👛 +👜 +👝 +👞 +👟 +👠 +👡 +👢 +👣 +👤 +👦 +👧 +👨 +👩 +👪 +👫 +👮 +👯 +👰 +👱 +👲 +👳 +👴 +👵 +👶 +👷 +👸 +👹 +👺 +👻 +👼 +👽 +👾 +👿 +💀 +💁 +💂 +💃 +💄 +💅 +💆 +💇 +💈 +💉 +💊 +💋 +💌 +💍 +💎 +💏 +💐 +💑 +💒 +💓 +💔 +💕 +💖 +💗 +💘 +💙 +💚 +💛 +💜 +💝 +💞 +💟 +💠 +💡 +💢 +💣 +💤 +💥 +💦 +💧 +💨 +💩 +💪 +💫 +💬 +💮 +💯 +💰 +💲 +💳 +💴 +💵 +💸 +💹 +💺 +💻 +💼 +💽 +💾 +💿 +📀 +📁 +📂 +📃 +📄 +📅 +📆 +📇 +📈 +📉 +📊 +📋 +📌 +📍 +📎 +📏 +📐 +📑 +📒 +📓 +📔 +📕 +📖 +📗 +📘 +📙 +📚 +📛 +📜 +📝 +📞 +📟 +📠 +📡 +📢 +📣 +📤 +📥 +📦 +📧 +📨 +📩 +📪 +📫 +📮 +📰 +📱 +📲 +📳 +📴 +📶 +📷 +📹 +📺 +📻 +📼 +🔃 +🔊 +🔋 +🔌 +🔍 +🔎 +🔏 +🔐 +🔑 +🔒 +🔓 +🔔 +🔖 +🔗 +🔘 +🔙 +🔚 +🔛 +🔜 +🔝 +🔞 +🔟 +🔠 +🔡 +🔢 +🔣 +🔤 +🔥 +🔦 +🔧 +🔨 +🔩 +🔪 +🔫 +🔮 +🔯 +🔰 +🔱 +🔲 +🔳 +🔴 +🔵 +🔶 +🔷 +🔸 +🔹 +🔺 +🔻 +🔼 +🔽 +🕐 +🕑 +🕒 +🕓 +🕔 +🕕 +🕖 +🕗 +🕘 +🕙 +🕚 +🕛 +🗻 +🗼 +🗽 +🗾 +🗿 +😀 +😇 +😈 +😎 +😐 +😑 +😕 +😗 +😙 +😛 +😟 +😦 +😧 +😬 +😮 +😯 +😴 +😶 +🚁 +🚂 +🚆 +🚈 +🚊 +🚍 +🚎 +🚐 +🚔 +🚖 +🚘 +🚛 +🚜 +🚝 +🚞 +🚟 +🚠 +🚡 +🚣 +🚦 +🚮 +🚯 +🚰 +🚱 +🚳 +🚴 +🚵 +🚷 +🚸 +🚿 +🛁 +🛂 +🛃 +🛄 +🛅 +🌍 +🌎 +🌐 +🌒 +🌖 +🌗 +🌘 +🌚 +🌜 +🌝 +🌞 +🌲 +🌳 +🍋 +🍐 +🍼 +🏇 +🏉 +🏤 +🐀 +🐁 +🐂 +🐃 +🐄 +🐅 +🐆 +🐇 +🐈 +🐉 +🐊 +🐋 +🐏 +🐐 +🐓 +🐕 +🐖 +🐪 +👥 +👬 +👭 +💭 +💶 +💷 +📬 +📭 +📯 +📵 +🔀 +🔁 +🔂 +🔄 +🔅 +🔆 +🔇 +🔉 
+🔕 +🔬 +🔭 +🕜 +🕝 +🕞 +🕟 +🕠 +🕡 +🕢 +🕣 +🕤 +🕥 +🕦 +🕧 diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..d50d6e5a267d9269eff250e9c4b88eb0c99dfb5c --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_eslav_dict.txt @@ -0,0 +1,517 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +© +‥ +{ +} +\ +| +@ +^ +~ +÷ +∕ +∙ +⋅ +· +± +∓ +∩ +∪ +□ +← +↔ +⇒ +⇐ +⇔ +∀ +∃ +∄ +∴ +∵ +∝ +∞ +⊥ +∟ +∠ +∡ +∢ +′ +″ +∥ +⊾ +⊿ +∂ +∫ +∬ +∭ +∮ +∯ +∰ +∑ +∏ +√ +∛ +∜ +∱ +∲ +∳ +∶ +∷ +∼ +® +℉ +Ω +℧ +Å +⌀ +ℏ +⅀ +⍺ +⍵ +¢ +€ +£ +¥ +₿ +Ⅰ +Ⅱ +Ⅲ +Ⅳ +Ⅴ +Ⅵ +Ⅶ +Ⅷ +Ⅸ +Ⅹ +Ⅺ +Ⅻ +ⅰ +ⅱ +ⅲ +ⅳ +ⅴ +ⅵ +ⅶ +ⅷ +ⅸ +ⅹ +ⅺ +ⅻ +➀ +➁ +➂ +➃ +➄ +➅ +➆ +➇ +➈ +➉ +➊ +➋ +➌ +➍ +➎ +➏ +➐ +➑ +➒ +➓ +❶ +❷ +❸ +❹ +❺ +❻ +❼ +❽ +❾ +❿ +① +② +③ +④ +⑤ +⑥ +⑦ +⑧ +⑨ +⑩ +● +▶ +𝑢 +︽ +– +﹥ +𝜓 +• +∋ +ƒ +० +⬆ +Ạ +◀ + +▫ +︾ +À +Á + +à +Ä +Å +Æ +Ç +È +É +Ê +Ë +Ì +Í +Î +Ï +Ð +Ñ +Ò +Ó +Ô +Õ +Ö +Ø +Ù +Ú +Û +Ü +Ý +Þ +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ð +ñ +ò +ó +ô +õ +ö +ø +ù +ú +û +ü +ý +þ +ÿ +¡ +¤ +¦ +§ +¨ +ª +« +¬ +¯ +° +² +³ +´ +µ +¶ +¸ +¹ +º +» +¼ +½ +¾ +¿ +× +‐ +‑ +‒ +— +― +‖ +‗ +‘ +’ +‚ +‛ +“ +” +„ +‟ +† +‡ +‣ +․ +… +‧ +‰ +‴ +‵ +‶ +‷ +‸ +‹ +› +※ +‼ +‽ +‾ +₤ +₡ +₹ +− +∖ +∗ +≈ +≠ +≡ +≤ +≥ +⊂ +⊃ +↑ +→ +↓ +↕ +™ +Ω +℮ +∆ +✓ +✗ +✘ +▪ +◼ +✔ +✕ +☑ +☒ +№ +₽ +₴ +Α +α +Β +β +Γ +γ +Δ +δ +Ε +ε +Ζ +ζ +Η +η +Θ +θ +Ι +ι +Κ +κ +Λ +λ +Μ +μ +Ν +ν +Ξ +ξ +Ο +ο +Π +π +Ρ +ρ +Σ +σ +ς +Τ +τ +Υ +υ +Φ +φ +Χ +χ +Ψ +ψ +ω +А +Б +В +Г +Ґ +Д +Е +Ё +Є +Ж +З +И +І +Ї +Й +К +Л +М +Н +О +П +Р +С +Т +У +Ў +Ф +Х +Ц +Ч +Ш +Щ +Ъ +Ы +Ь +Э +Ю +Я +а +б +в +г +ґ +д +е +ё +є +ж +з +и +і +ї 
+й +к +л +м +н +о +п +р +с +т +у +ў +ф +х +ц +ч +ш +щ +ъ +ы +ь +э +ю +я diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2202631098b4e49662ed4d29a380df4cbf6de78 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_korean_dict.txt @@ -0,0 +1,11945 @@ +ᄀ +ᄁ +ᄂ +ᄃ +ᄄ +ᄅ +ᄆ +ᄇ +ᄈ +ᄉ +ᄊ +ᄋ +ᄌ +ᄍ +ᄎ +ᄏ +ᄐ +ᄑ +ᄒ +ᄓ +ᄔ +ᄕ +ᄖ +ᄗ +ᄘ +ᄙ +ᄚ +ᄛ +ᄜ +ᄝ +ᄞ +ᄟ +ᄠ +ᄡ +ᄢ +ᄣ +ᄤ +ᄥ +ᄦ +ᄧ +ᄨ +ᄩ +ᄪ +ᄫ +ᄬ +ᄭ +ᄮ +ᄯ +ᄰ +ᄱ +ᄲ +ᄳ +ᄴ +ᄵ +ᄶ +ᄷ +ᄸ +ᄹ +ᄺ +ᄻ +ᄼ +ᄽ +ᄾ +ᄿ +ᅀ +ᅁ +ᅂ +ᅃ +ᅄ +ᅅ +ᅆ +ᅇ +ᅈ +ᅉ +ᅊ +ᅋ +ᅌ +ᅍ +ᅎ +ᅏ +ᅐ +ᅑ +ᅒ +ᅓ +ᅔ +ᅕ +ᅖ +ᅗ +ᅘ +ᅙ +ᅡ +ᅢ +ᅣ +ᅤ +ᅥ +ᅦ +ᅧ +ᅨ +ᅩ +ᅪ +ᅫ +ᅬ +ᅭ +ᅮ +ᅯ +ᅰ +ᅱ +ᅲ +ᅳ +ᅴ +ᅵ +ᅶ +ᅷ +ᅸ +ᅹ +ᅺ +ᅻ +ᅼ +ᅽ +ᅾ +ᅿ +ᆀ +ᆁ +ᆂ +ᆃ +ᆄ +ᆅ +ᆆ +ᆇ +ᆈ +ᆉ +ᆊ +ᆋ +ᆌ +ᆍ +ᆎ +ᆏ +ᆐ +ᆑ +ᆒ +ᆓ +ᆔ +ᆕ +ᆖ +ᆗ +ᆘ +ᆙ +ᆚ +ᆛ +ᆜ +ᆝ +ᆞ +ᆟ +ᆠ +ᆡ +ᆢ +ᆨ +ᆩ +ᆪ +ᆫ +ᆬ +ᆭ +ᆮ +ᆯ +ᆰ +ᆱ +ᆲ +ᆳ +ᆴ +ᆵ +ᆶ +ᆷ +ᆸ +ᆹ +ᆺ +ᆻ +ᆼ +ᆽ +ᆾ +ᆿ +ᇀ +ᇁ +ᇂ +ᇃ +ᇄ +ᇅ +ᇆ +ᇇ +ᇈ +ᇉ +ᇊ +ᇋ +ᇌ +ᇍ +ᇎ +ᇏ +ᇐ +ᇑ +ᇒ +ᇓ +ᇔ +ᇕ +ᇖ +ᇗ +ᇘ +ᇙ +ᇚ +ᇛ +ᇜ +ᇝ +ᇞ +ᇟ +ᇠ +ᇡ +ᇢ +ᇣ +ᇤ +ᇥ +ᇦ +ᇧ +ᇨ +ᇩ +ᇪ +ᇫ +ᇬ +ᇭ +ᇮ +ᇯ +ᇰ +ᇱ +ᇲ +ᇳ +ᇴ +ᇵ +ᇶ +ᇷ +ᇸ +ᇹ +ㄱ +ㄲ +ㄳ +ㄴ +ㄵ +ㄶ +ㄷ +ㄸ +ㄹ +ㄺ +ㄻ +ㄼ +ㄽ +ㄾ +ㄿ +ㅀ +ㅁ +ㅂ +ㅃ +ㅄ +ㅅ +ㅆ +ㅇ +ㅈ +ㅉ +ㅊ +ㅋ +ㅌ +ㅍ +ㅎ +ㅏ +ㅐ +ㅑ +ㅒ +ㅓ +ㅔ +ㅕ +ㅖ +ㅗ +ㅘ +ㅙ +ㅚ +ㅛ +ㅜ +ㅝ +ㅞ +ㅟ +ㅠ +ㅡ +ㅢ +ㅣ +ㅤ +ㅥ +ㅦ +ㅧ +ㅨ +ㅩ +ㅪ +ㅫ +ㅬ +ㅭ +ㅮ +ㅯ +ㅰ +ㅱ +ㅲ +ㅳ +ㅴ +ㅵ +ㅶ +ㅷ +ㅸ +ㅹ +ㅺ +ㅻ +ㅼ +ㅽ +ㅾ +ㅿ +ㆀ +ㆁ +ㆂ +ㆃ +ㆄ +ㆅ +ㆆ +ㆇ +ㆈ +ㆉ +ㆊ +ㆋ +ㆌ +ㆍ +ㆎ +가 +각 +갂 +갃 +간 +갅 +갆 +갇 +갈 +갉 +갊 +갋 +갌 +갍 +갎 +갏 +감 +갑 +값 +갓 +갔 +강 +갖 +갗 +갘 +같 +갚 +갛 +개 +객 +갞 +갟 +갠 +갡 +갢 +갣 +갤 +갥 +갦 +갧 +갨 +갩 +갪 +갫 +갬 +갭 +갮 +갯 +갰 +갱 +갲 +갳 +갴 +갵 +갶 +갷 +갸 +갹 +갺 +갻 +갼 +갽 +갾 +갿 +걀 +걁 +걂 +걃 +걄 +걅 +걆 +걇 +걈 +걉 +걊 +걋 +걌 +걍 +걎 +걏 +걐 +걑 +걒 +걓 +걔 +걕 +걖 +걗 +걘 +걙 +걚 +걛 +걜 +걝 +걞 +걟 +걠 +걡 +걢 +걣 +걤 +걥 +걦 +걧 +걨 +걩 +걪 +걫 +걬 +걭 +걮 +걯 +거 +걱 +걲 +걳 +건 +걵 +걶 +걷 +걸 +걹 +걺 +걻 +걼 +걽 +걾 +걿 +검 +겁 +겂 +것 +겄 +겅 +겆 +겇 +겈 +겉 +겊 +겋 +게 +겍 +겎 +겏 +겐 +겑 +겒 +겓 +겔 +겕 
+겖 +겗 +겘 +겙 +겚 +겛 +겜 +겝 +겞 +겟 +겠 +겡 +겢 +겣 +겤 +겥 +겦 +겧 +겨 +격 +겪 +겫 +견 +겭 +겮 +겯 +결 +겱 +겲 +겳 +겴 +겵 +겶 +겷 +겸 +겹 +겺 +겻 +겼 +경 +겾 +겿 +곀 +곁 +곂 +곃 +계 +곅 +곆 +곇 +곈 +곉 +곊 +곋 +곌 +곍 +곎 +곏 +곐 +곑 +곒 +곓 +곔 +곕 +곖 +곗 +곘 +곙 +곚 +곛 +곜 +곝 +곞 +곟 +고 +곡 +곢 +곣 +곤 +곥 +곦 +곧 +골 +곩 +곪 +곫 +곬 +곭 +곮 +곯 +곰 +곱 +곲 +곳 +곴 +공 +곶 +곷 +곸 +곹 +곺 +곻 +과 +곽 +곾 +곿 +관 +괁 +괂 +괃 +괄 +괅 +괆 +괇 +괈 +괉 +괊 +괋 +괌 +괍 +괎 +괏 +괐 +광 +괒 +괓 +괔 +괕 +괖 +괗 +괘 +괙 +괚 +괛 +괜 +괝 +괞 +괟 +괠 +괡 +괢 +괣 +괤 +괥 +괦 +괧 +괨 +괩 +괪 +괫 +괬 +괭 +괮 +괯 +괰 +괱 +괲 +괳 +괴 +괵 +괶 +괷 +괸 +괹 +괺 +괻 +괼 +괽 +괾 +괿 +굀 +굁 +굂 +굃 +굄 +굅 +굆 +굇 +굈 +굉 +굊 +굋 +굌 +굍 +굎 +굏 +교 +굑 +굒 +굓 +굔 +굕 +굖 +굗 +굘 +굙 +굚 +굛 +굜 +굝 +굞 +굟 +굠 +굡 +굢 +굣 +굤 +굥 +굦 +굧 +굨 +굩 +굪 +굫 +구 +국 +굮 +굯 +군 +굱 +굲 +굳 +굴 +굵 +굶 +굷 +굸 +굹 +굺 +굻 +굼 +굽 +굾 +굿 +궀 +궁 +궂 +궃 +궄 +궅 +궆 +궇 +궈 +궉 +궊 +궋 +권 +궍 +궎 +궏 +궐 +궑 +궒 +궓 +궔 +궕 +궖 +궗 +궘 +궙 +궚 +궛 +궜 +궝 +궞 +궟 +궠 +궡 +궢 +궣 +궤 +궥 +궦 +궧 +궨 +궩 +궪 +궫 +궬 +궭 +궮 +궯 +궰 +궱 +궲 +궳 +궴 +궵 +궶 +궷 +궸 +궹 +궺 +궻 +궼 +궽 +궾 +궿 +귀 +귁 +귂 +귃 +귄 +귅 +귆 +귇 +귈 +귉 +귊 +귋 +귌 +귍 +귎 +귏 +귐 +귑 +귒 +귓 +귔 +귕 +귖 +귗 +귘 +귙 +귚 +귛 +규 +귝 +귞 +귟 +균 +귡 +귢 +귣 +귤 +귥 +귦 +귧 +귨 +귩 +귪 +귫 +귬 +귭 +귮 +귯 +귰 +귱 +귲 +귳 +귴 +귵 +귶 +귷 +그 +극 +귺 +귻 +근 +귽 +귾 +귿 +글 +긁 +긂 +긃 +긄 +긅 +긆 +긇 +금 +급 +긊 +긋 +긌 +긍 +긎 +긏 +긐 +긑 +긒 +긓 +긔 +긕 +긖 +긗 +긘 +긙 +긚 +긛 +긜 +긝 +긞 +긟 +긠 +긡 +긢 +긣 +긤 +긥 +긦 +긧 +긨 +긩 +긪 +긫 +긬 +긭 +긮 +긯 +기 +긱 +긲 +긳 +긴 +긵 +긶 +긷 +길 +긹 +긺 +긻 +긼 +긽 +긾 +긿 +김 +깁 +깂 +깃 +깄 +깅 +깆 +깇 +깈 +깉 +깊 +깋 +까 +깍 +깎 +깏 +깐 +깑 +깒 +깓 +깔 +깕 +깖 +깗 +깘 +깙 +깚 +깛 +깜 +깝 +깞 +깟 +깠 +깡 +깢 +깣 +깤 +깥 +깦 +깧 +깨 +깩 +깪 +깫 +깬 +깭 +깮 +깯 +깰 +깱 +깲 +깳 +깴 +깵 +깶 +깷 +깸 +깹 +깺 +깻 +깼 +깽 +깾 +깿 +꺀 +꺁 +꺂 +꺃 +꺄 +꺅 +꺆 +꺇 +꺈 +꺉 +꺊 +꺋 +꺌 +꺍 +꺎 +꺏 +꺐 +꺑 +꺒 +꺓 +꺔 +꺕 +꺖 +꺗 +꺘 +꺙 +꺚 +꺛 +꺜 +꺝 +꺞 +꺟 +꺠 +꺡 +꺢 +꺣 +꺤 +꺥 +꺦 +꺧 +꺨 +꺩 +꺪 +꺫 +꺬 +꺭 +꺮 +꺯 +꺰 +꺱 +꺲 +꺳 +꺴 +꺵 +꺶 +꺷 +꺸 +꺹 +꺺 +꺻 +꺼 +꺽 +꺾 +꺿 +껀 +껁 +껂 +껃 +껄 +껅 +껆 +껇 +껈 +껉 +껊 +껋 +껌 +껍 +껎 +껏 +껐 +껑 +껒 +껓 +껔 +껕 +껖 +껗 +께 +껙 +껚 +껛 +껜 +껝 +껞 +껟 +껠 +껡 +껢 +껣 +껤 +껥 +껦 +껧 +껨 +껩 +껪 +껫 +껬 +껭 +껮 +껯 +껰 +껱 +껲 +껳 +껴 +껵 +껶 +껷 +껸 +껹 +껺 +껻 +껼 +껽 +껾 +껿 +꼀 +꼁 +꼂 +꼃 +꼄 +꼅 +꼆 +꼇 +꼈 +꼉 +꼊 +꼋 +꼌 +꼍 +꼎 +꼏 +꼐 +꼑 +꼒 +꼓 +꼔 +꼕 +꼖 +꼗 +꼘 +꼙 +꼚 +꼛 +꼜 +꼝 +꼞 +꼟 +꼠 +꼡 +꼢 +꼣 +꼤 +꼥 +꼦 +꼧 +꼨 +꼩 +꼪 +꼫 +꼬 +꼭 +꼮 +꼯 
+꼰 +꼱 +꼲 +꼳 +꼴 +꼵 +꼶 +꼷 +꼸 +꼹 +꼺 +꼻 +꼼 +꼽 +꼾 +꼿 +꽀 +꽁 +꽂 +꽃 +꽄 +꽅 +꽆 +꽇 +꽈 +꽉 +꽊 +꽋 +꽌 +꽍 +꽎 +꽏 +꽐 +꽑 +꽒 +꽓 +꽔 +꽕 +꽖 +꽗 +꽘 +꽙 +꽚 +꽛 +꽜 +꽝 +꽞 +꽟 +꽠 +꽡 +꽢 +꽣 +꽤 +꽥 +꽦 +꽧 +꽨 +꽩 +꽪 +꽫 +꽬 +꽭 +꽮 +꽯 +꽰 +꽱 +꽲 +꽳 +꽴 +꽵 +꽶 +꽷 +꽸 +꽹 +꽺 +꽻 +꽼 +꽽 +꽾 +꽿 +꾀 +꾁 +꾂 +꾃 +꾄 +꾅 +꾆 +꾇 +꾈 +꾉 +꾊 +꾋 +꾌 +꾍 +꾎 +꾏 +꾐 +꾑 +꾒 +꾓 +꾔 +꾕 +꾖 +꾗 +꾘 +꾙 +꾚 +꾛 +꾜 +꾝 +꾞 +꾟 +꾠 +꾡 +꾢 +꾣 +꾤 +꾥 +꾦 +꾧 +꾨 +꾩 +꾪 +꾫 +꾬 +꾭 +꾮 +꾯 +꾰 +꾱 +꾲 +꾳 +꾴 +꾵 +꾶 +꾷 +꾸 +꾹 +꾺 +꾻 +꾼 +꾽 +꾾 +꾿 +꿀 +꿁 +꿂 +꿃 +꿄 +꿅 +꿆 +꿇 +꿈 +꿉 +꿊 +꿋 +꿌 +꿍 +꿎 +꿏 +꿐 +꿑 +꿒 +꿓 +꿔 +꿕 +꿖 +꿗 +꿘 +꿙 +꿚 +꿛 +꿜 +꿝 +꿞 +꿟 +꿠 +꿡 +꿢 +꿣 +꿤 +꿥 +꿦 +꿧 +꿨 +꿩 +꿪 +꿫 +꿬 +꿭 +꿮 +꿯 +꿰 +꿱 +꿲 +꿳 +꿴 +꿵 +꿶 +꿷 +꿸 +꿹 +꿺 +꿻 +꿼 +꿽 +꿾 +꿿 +뀀 +뀁 +뀂 +뀃 +뀄 +뀅 +뀆 +뀇 +뀈 +뀉 +뀊 +뀋 +뀌 +뀍 +뀎 +뀏 +뀐 +뀑 +뀒 +뀓 +뀔 +뀕 +뀖 +뀗 +뀘 +뀙 +뀚 +뀛 +뀜 +뀝 +뀞 +뀟 +뀠 +뀡 +뀢 +뀣 +뀤 +뀥 +뀦 +뀧 +뀨 +뀩 +뀪 +뀫 +뀬 +뀭 +뀮 +뀯 +뀰 +뀱 +뀲 +뀳 +뀴 +뀵 +뀶 +뀷 +뀸 +뀹 +뀺 +뀻 +뀼 +뀽 +뀾 +뀿 +끀 +끁 +끂 +끃 +끄 +끅 +끆 +끇 +끈 +끉 +끊 +끋 +끌 +끍 +끎 +끏 +끐 +끑 +끒 +끓 +끔 +끕 +끖 +끗 +끘 +끙 +끚 +끛 +끜 +끝 +끞 +끟 +끠 +끡 +끢 +끣 +끤 +끥 +끦 +끧 +끨 +끩 +끪 +끫 +끬 +끭 +끮 +끯 +끰 +끱 +끲 +끳 +끴 +끵 +끶 +끷 +끸 +끹 +끺 +끻 +끼 +끽 +끾 +끿 +낀 +낁 +낂 +낃 +낄 +낅 +낆 +낇 +낈 +낉 +낊 +낋 +낌 +낍 +낎 +낏 +낐 +낑 +낒 +낓 +낔 +낕 +낖 +낗 +나 +낙 +낚 +낛 +난 +낝 +낞 +낟 +날 +낡 +낢 +낣 +낤 +낥 +낦 +낧 +남 +납 +낪 +낫 +났 +낭 +낮 +낯 +낰 +낱 +낲 +낳 +내 +낵 +낶 +낷 +낸 +낹 +낺 +낻 +낼 +낽 +낾 +낿 +냀 +냁 +냂 +냃 +냄 +냅 +냆 +냇 +냈 +냉 +냊 +냋 +냌 +냍 +냎 +냏 +냐 +냑 +냒 +냓 +냔 +냕 +냖 +냗 +냘 +냙 +냚 +냛 +냜 +냝 +냞 +냟 +냠 +냡 +냢 +냣 +냤 +냥 +냦 +냧 +냨 +냩 +냪 +냫 +냬 +냭 +냮 +냯 +냰 +냱 +냲 +냳 +냴 +냵 +냶 +냷 +냸 +냹 +냺 +냻 +냼 +냽 +냾 +냿 +넀 +넁 +넂 +넃 +넄 +넅 +넆 +넇 +너 +넉 +넊 +넋 +넌 +넍 +넎 +넏 +널 +넑 +넒 +넓 +넔 +넕 +넖 +넗 +넘 +넙 +넚 +넛 +넜 +넝 +넞 +넟 +넠 +넡 +넢 +넣 +네 +넥 +넦 +넧 +넨 +넩 +넪 +넫 +넬 +넭 +넮 +넯 +넰 +넱 +넲 +넳 +넴 +넵 +넶 +넷 +넸 +넹 +넺 +넻 +넼 +넽 +넾 +넿 +녀 +녁 +녂 +녃 +년 +녅 +녆 +녇 +녈 +녉 +녊 +녋 +녌 +녍 +녎 +녏 +념 +녑 +녒 +녓 +녔 +녕 +녖 +녗 +녘 +녙 +녚 +녛 +녜 +녝 +녞 +녟 +녠 +녡 +녢 +녣 +녤 +녥 +녦 +녧 +녨 +녩 +녪 +녫 +녬 +녭 +녮 +녯 +녰 +녱 +녲 +녳 +녴 +녵 +녶 +녷 +노 +녹 +녺 +녻 +논 +녽 +녾 +녿 +놀 +놁 +놂 +놃 +놄 +놅 +놆 +놇 +놈 +놉 +놊 +놋 +놌 +농 +놎 +놏 +놐 +놑 +높 +놓 +놔 +놕 +놖 +놗 +놘 +놙 +놚 +놛 +놜 +놝 +놞 +놟 +놠 +놡 +놢 +놣 +놤 +놥 +놦 +놧 +놨 +놩 +놪 +놫 +놬 +놭 +놮 +놯 +놰 +놱 +놲 +놳 +놴 +놵 +놶 +놷 +놸 +놹 +놺 +놻 +놼 +놽 +놾 +놿 +뇀 +뇁 +뇂 +뇃 +뇄 +뇅 +뇆 +뇇 +뇈 +뇉 
+뇊 +뇋 +뇌 +뇍 +뇎 +뇏 +뇐 +뇑 +뇒 +뇓 +뇔 +뇕 +뇖 +뇗 +뇘 +뇙 +뇚 +뇛 +뇜 +뇝 +뇞 +뇟 +뇠 +뇡 +뇢 +뇣 +뇤 +뇥 +뇦 +뇧 +뇨 +뇩 +뇪 +뇫 +뇬 +뇭 +뇮 +뇯 +뇰 +뇱 +뇲 +뇳 +뇴 +뇵 +뇶 +뇷 +뇸 +뇹 +뇺 +뇻 +뇼 +뇽 +뇾 +뇿 +눀 +눁 +눂 +눃 +누 +눅 +눆 +눇 +눈 +눉 +눊 +눋 +눌 +눍 +눎 +눏 +눐 +눑 +눒 +눓 +눔 +눕 +눖 +눗 +눘 +눙 +눚 +눛 +눜 +눝 +눞 +눟 +눠 +눡 +눢 +눣 +눤 +눥 +눦 +눧 +눨 +눩 +눪 +눫 +눬 +눭 +눮 +눯 +눰 +눱 +눲 +눳 +눴 +눵 +눶 +눷 +눸 +눹 +눺 +눻 +눼 +눽 +눾 +눿 +뉀 +뉁 +뉂 +뉃 +뉄 +뉅 +뉆 +뉇 +뉈 +뉉 +뉊 +뉋 +뉌 +뉍 +뉎 +뉏 +뉐 +뉑 +뉒 +뉓 +뉔 +뉕 +뉖 +뉗 +뉘 +뉙 +뉚 +뉛 +뉜 +뉝 +뉞 +뉟 +뉠 +뉡 +뉢 +뉣 +뉤 +뉥 +뉦 +뉧 +뉨 +뉩 +뉪 +뉫 +뉬 +뉭 +뉮 +뉯 +뉰 +뉱 +뉲 +뉳 +뉴 +뉵 +뉶 +뉷 +뉸 +뉹 +뉺 +뉻 +뉼 +뉽 +뉾 +뉿 +늀 +늁 +늂 +늃 +늄 +늅 +늆 +늇 +늈 +늉 +늊 +늋 +늌 +늍 +늎 +늏 +느 +늑 +늒 +늓 +는 +늕 +늖 +늗 +늘 +늙 +늚 +늛 +늜 +늝 +늞 +늟 +늠 +늡 +늢 +늣 +늤 +능 +늦 +늧 +늨 +늩 +늪 +늫 +늬 +늭 +늮 +늯 +늰 +늱 +늲 +늳 +늴 +늵 +늶 +늷 +늸 +늹 +늺 +늻 +늼 +늽 +늾 +늿 +닀 +닁 +닂 +닃 +닄 +닅 +닆 +닇 +니 +닉 +닊 +닋 +닌 +닍 +닎 +닏 +닐 +닑 +닒 +닓 +닔 +닕 +닖 +닗 +님 +닙 +닚 +닛 +닜 +닝 +닞 +닟 +닠 +닡 +닢 +닣 +다 +닥 +닦 +닧 +단 +닩 +닪 +닫 +달 +닭 +닮 +닯 +닰 +닱 +닲 +닳 +담 +답 +닶 +닷 +닸 +당 +닺 +닻 +닼 +닽 +닾 +닿 +대 +댁 +댂 +댃 +댄 +댅 +댆 +댇 +댈 +댉 +댊 +댋 +댌 +댍 +댎 +댏 +댐 +댑 +댒 +댓 +댔 +댕 +댖 +댗 +댘 +댙 +댚 +댛 +댜 +댝 +댞 +댟 +댠 +댡 +댢 +댣 +댤 +댥 +댦 +댧 +댨 +댩 +댪 +댫 +댬 +댭 +댮 +댯 +댰 +댱 +댲 +댳 +댴 +댵 +댶 +댷 +댸 +댹 +댺 +댻 +댼 +댽 +댾 +댿 +덀 +덁 +덂 +덃 +덄 +덅 +덆 +덇 +덈 +덉 +덊 +덋 +덌 +덍 +덎 +덏 +덐 +덑 +덒 +덓 +더 +덕 +덖 +덗 +던 +덙 +덚 +덛 +덜 +덝 +덞 +덟 +덠 +덡 +덢 +덣 +덤 +덥 +덦 +덧 +덨 +덩 +덪 +덫 +덬 +덭 +덮 +덯 +데 +덱 +덲 +덳 +덴 +덵 +덶 +덷 +델 +덹 +덺 +덻 +덼 +덽 +덾 +덿 +뎀 +뎁 +뎂 +뎃 +뎄 +뎅 +뎆 +뎇 +뎈 +뎉 +뎊 +뎋 +뎌 +뎍 +뎎 +뎏 +뎐 +뎑 +뎒 +뎓 +뎔 +뎕 +뎖 +뎗 +뎘 +뎙 +뎚 +뎛 +뎜 +뎝 +뎞 +뎟 +뎠 +뎡 +뎢 +뎣 +뎤 +뎥 +뎦 +뎧 +뎨 +뎩 +뎪 +뎫 +뎬 +뎭 +뎮 +뎯 +뎰 +뎱 +뎲 +뎳 +뎴 +뎵 +뎶 +뎷 +뎸 +뎹 +뎺 +뎻 +뎼 +뎽 +뎾 +뎿 +돀 +돁 +돂 +돃 +도 +독 +돆 +돇 +돈 +돉 +돊 +돋 +돌 +돍 +돎 +돏 +돐 +돑 +돒 +돓 +돔 +돕 +돖 +돗 +돘 +동 +돚 +돛 +돜 +돝 +돞 +돟 +돠 +돡 +돢 +돣 +돤 +돥 +돦 +돧 +돨 +돩 +돪 +돫 +돬 +돭 +돮 +돯 +돰 +돱 +돲 +돳 +돴 +돵 +돶 +돷 +돸 +돹 +돺 +돻 +돼 +돽 +돾 +돿 +됀 +됁 +됂 +됃 +됄 +됅 +됆 +됇 +됈 +됉 +됊 +됋 +됌 +됍 +됎 +됏 +됐 +됑 +됒 +됓 +됔 +됕 +됖 +됗 +되 +됙 +됚 +됛 +된 +됝 +됞 +됟 +될 +됡 +됢 +됣 +됤 +됥 +됦 +됧 +됨 +됩 +됪 +됫 +됬 +됭 +됮 +됯 +됰 +됱 +됲 +됳 +됴 +됵 +됶 +됷 +됸 +됹 +됺 +됻 +됼 +됽 +됾 +됿 +둀 +둁 +둂 +둃 +둄 +둅 +둆 +둇 +둈 +둉 +둊 +둋 +둌 +둍 +둎 +둏 +두 +둑 +둒 +둓 +둔 +둕 +둖 +둗 +둘 +둙 +둚 +둛 +둜 +둝 +둞 +둟 +둠 +둡 +둢 +둣 
+둤 +둥 +둦 +둧 +둨 +둩 +둪 +둫 +둬 +둭 +둮 +둯 +둰 +둱 +둲 +둳 +둴 +둵 +둶 +둷 +둸 +둹 +둺 +둻 +둼 +둽 +둾 +둿 +뒀 +뒁 +뒂 +뒃 +뒄 +뒅 +뒆 +뒇 +뒈 +뒉 +뒊 +뒋 +뒌 +뒍 +뒎 +뒏 +뒐 +뒑 +뒒 +뒓 +뒔 +뒕 +뒖 +뒗 +뒘 +뒙 +뒚 +뒛 +뒜 +뒝 +뒞 +뒟 +뒠 +뒡 +뒢 +뒣 +뒤 +뒥 +뒦 +뒧 +뒨 +뒩 +뒪 +뒫 +뒬 +뒭 +뒮 +뒯 +뒰 +뒱 +뒲 +뒳 +뒴 +뒵 +뒶 +뒷 +뒸 +뒹 +뒺 +뒻 +뒼 +뒽 +뒾 +뒿 +듀 +듁 +듂 +듃 +듄 +듅 +듆 +듇 +듈 +듉 +듊 +듋 +듌 +듍 +듎 +듏 +듐 +듑 +듒 +듓 +듔 +듕 +듖 +듗 +듘 +듙 +듚 +듛 +드 +득 +듞 +듟 +든 +듡 +듢 +듣 +들 +듥 +듦 +듧 +듨 +듩 +듪 +듫 +듬 +듭 +듮 +듯 +듰 +등 +듲 +듳 +듴 +듵 +듶 +듷 +듸 +듹 +듺 +듻 +듼 +듽 +듾 +듿 +딀 +딁 +딂 +딃 +딄 +딅 +딆 +딇 +딈 +딉 +딊 +딋 +딌 +딍 +딎 +딏 +딐 +딑 +딒 +딓 +디 +딕 +딖 +딗 +딘 +딙 +딚 +딛 +딜 +딝 +딞 +딟 +딠 +딡 +딢 +딣 +딤 +딥 +딦 +딧 +딨 +딩 +딪 +딫 +딬 +딭 +딮 +딯 +따 +딱 +딲 +딳 +딴 +딵 +딶 +딷 +딸 +딹 +딺 +딻 +딼 +딽 +딾 +딿 +땀 +땁 +땂 +땃 +땄 +땅 +땆 +땇 +땈 +땉 +땊 +땋 +때 +땍 +땎 +땏 +땐 +땑 +땒 +땓 +땔 +땕 +땖 +땗 +땘 +땙 +땚 +땛 +땜 +땝 +땞 +땟 +땠 +땡 +땢 +땣 +땤 +땥 +땦 +땧 +땨 +땩 +땪 +땫 +땬 +땭 +땮 +땯 +땰 +땱 +땲 +땳 +땴 +땵 +땶 +땷 +땸 +땹 +땺 +땻 +땼 +땽 +땾 +땿 +떀 +떁 +떂 +떃 +떄 +떅 +떆 +떇 +떈 +떉 +떊 +떋 +떌 +떍 +떎 +떏 +떐 +떑 +떒 +떓 +떔 +떕 +떖 +떗 +떘 +떙 +떚 +떛 +떜 +떝 +떞 +떟 +떠 +떡 +떢 +떣 +떤 +떥 +떦 +떧 +떨 +떩 +떪 +떫 +떬 +떭 +떮 +떯 +떰 +떱 +떲 +떳 +떴 +떵 +떶 +떷 +떸 +떹 +떺 +떻 +떼 +떽 +떾 +떿 +뗀 +뗁 +뗂 +뗃 +뗄 +뗅 +뗆 +뗇 +뗈 +뗉 +뗊 +뗋 +뗌 +뗍 +뗎 +뗏 +뗐 +뗑 +뗒 +뗓 +뗔 +뗕 +뗖 +뗗 +뗘 +뗙 +뗚 +뗛 +뗜 +뗝 +뗞 +뗟 +뗠 +뗡 +뗢 +뗣 +뗤 +뗥 +뗦 +뗧 +뗨 +뗩 +뗪 +뗫 +뗬 +뗭 +뗮 +뗯 +뗰 +뗱 +뗲 +뗳 +뗴 +뗵 +뗶 +뗷 +뗸 +뗹 +뗺 +뗻 +뗼 +뗽 +뗾 +뗿 +똀 +똁 +똂 +똃 +똄 +똅 +똆 +똇 +똈 +똉 +똊 +똋 +똌 +똍 +똎 +똏 +또 +똑 +똒 +똓 +똔 +똕 +똖 +똗 +똘 +똙 +똚 +똛 +똜 +똝 +똞 +똟 +똠 +똡 +똢 +똣 +똤 +똥 +똦 +똧 +똨 +똩 +똪 +똫 +똬 +똭 +똮 +똯 +똰 +똱 +똲 +똳 +똴 +똵 +똶 +똷 +똸 +똹 +똺 +똻 +똼 +똽 +똾 +똿 +뙀 +뙁 +뙂 +뙃 +뙄 +뙅 +뙆 +뙇 +뙈 +뙉 +뙊 +뙋 +뙌 +뙍 +뙎 +뙏 +뙐 +뙑 +뙒 +뙓 +뙔 +뙕 +뙖 +뙗 +뙘 +뙙 +뙚 +뙛 +뙜 +뙝 +뙞 +뙟 +뙠 +뙡 +뙢 +뙣 +뙤 +뙥 +뙦 +뙧 +뙨 +뙩 +뙪 +뙫 +뙬 +뙭 +뙮 +뙯 +뙰 +뙱 +뙲 +뙳 +뙴 +뙵 +뙶 +뙷 +뙸 +뙹 +뙺 +뙻 +뙼 +뙽 +뙾 +뙿 +뚀 +뚁 +뚂 +뚃 +뚄 +뚅 +뚆 +뚇 +뚈 +뚉 +뚊 +뚋 +뚌 +뚍 +뚎 +뚏 +뚐 +뚑 +뚒 +뚓 +뚔 +뚕 +뚖 +뚗 +뚘 +뚙 +뚚 +뚛 +뚜 +뚝 +뚞 +뚟 +뚠 +뚡 +뚢 +뚣 +뚤 +뚥 +뚦 +뚧 +뚨 +뚩 +뚪 +뚫 +뚬 +뚭 +뚮 +뚯 +뚰 +뚱 +뚲 +뚳 +뚴 +뚵 +뚶 +뚷 +뚸 +뚹 +뚺 +뚻 +뚼 +뚽 +뚾 +뚿 +뛀 +뛁 +뛂 +뛃 +뛄 +뛅 +뛆 +뛇 +뛈 +뛉 +뛊 +뛋 +뛌 +뛍 +뛎 +뛏 +뛐 +뛑 +뛒 +뛓 +뛔 +뛕 +뛖 +뛗 +뛘 +뛙 +뛚 +뛛 +뛜 +뛝 +뛞 +뛟 +뛠 +뛡 +뛢 +뛣 +뛤 +뛥 +뛦 +뛧 +뛨 +뛩 +뛪 +뛫 +뛬 +뛭 +뛮 +뛯 +뛰 +뛱 +뛲 +뛳 +뛴 +뛵 +뛶 +뛷 +뛸 +뛹 +뛺 +뛻 +뛼 +뛽 
+뛾 +뛿 +뜀 +뜁 +뜂 +뜃 +뜄 +뜅 +뜆 +뜇 +뜈 +뜉 +뜊 +뜋 +뜌 +뜍 +뜎 +뜏 +뜐 +뜑 +뜒 +뜓 +뜔 +뜕 +뜖 +뜗 +뜘 +뜙 +뜚 +뜛 +뜜 +뜝 +뜞 +뜟 +뜠 +뜡 +뜢 +뜣 +뜤 +뜥 +뜦 +뜧 +뜨 +뜩 +뜪 +뜫 +뜬 +뜭 +뜮 +뜯 +뜰 +뜱 +뜲 +뜳 +뜴 +뜵 +뜶 +뜷 +뜸 +뜹 +뜺 +뜻 +뜼 +뜽 +뜾 +뜿 +띀 +띁 +띂 +띃 +띄 +띅 +띆 +띇 +띈 +띉 +띊 +띋 +띌 +띍 +띎 +띏 +띐 +띑 +띒 +띓 +띔 +띕 +띖 +띗 +띘 +띙 +띚 +띛 +띜 +띝 +띞 +띟 +띠 +띡 +띢 +띣 +띤 +띥 +띦 +띧 +띨 +띩 +띪 +띫 +띬 +띭 +띮 +띯 +띰 +띱 +띲 +띳 +띴 +띵 +띶 +띷 +띸 +띹 +띺 +띻 +라 +락 +띾 +띿 +란 +랁 +랂 +랃 +랄 +랅 +랆 +랇 +랈 +랉 +랊 +랋 +람 +랍 +랎 +랏 +랐 +랑 +랒 +랓 +랔 +랕 +랖 +랗 +래 +랙 +랚 +랛 +랜 +랝 +랞 +랟 +랠 +랡 +랢 +랣 +랤 +랥 +랦 +랧 +램 +랩 +랪 +랫 +랬 +랭 +랮 +랯 +랰 +랱 +랲 +랳 +랴 +략 +랶 +랷 +랸 +랹 +랺 +랻 +랼 +랽 +랾 +랿 +럀 +럁 +럂 +럃 +럄 +럅 +럆 +럇 +럈 +량 +럊 +럋 +럌 +럍 +럎 +럏 +럐 +럑 +럒 +럓 +럔 +럕 +럖 +럗 +럘 +럙 +럚 +럛 +럜 +럝 +럞 +럟 +럠 +럡 +럢 +럣 +럤 +럥 +럦 +럧 +럨 +럩 +럪 +럫 +러 +럭 +럮 +럯 +런 +럱 +럲 +럳 +럴 +럵 +럶 +럷 +럸 +럹 +럺 +럻 +럼 +럽 +럾 +럿 +렀 +렁 +렂 +렃 +렄 +렅 +렆 +렇 +레 +렉 +렊 +렋 +렌 +렍 +렎 +렏 +렐 +렑 +렒 +렓 +렔 +렕 +렖 +렗 +렘 +렙 +렚 +렛 +렜 +렝 +렞 +렟 +렠 +렡 +렢 +렣 +려 +력 +렦 +렧 +련 +렩 +렪 +렫 +렬 +렭 +렮 +렯 +렰 +렱 +렲 +렳 +렴 +렵 +렶 +렷 +렸 +령 +렺 +렻 +렼 +렽 +렾 +렿 +례 +롁 +롂 +롃 +롄 +롅 +롆 +롇 +롈 +롉 +롊 +롋 +롌 +롍 +롎 +롏 +롐 +롑 +롒 +롓 +롔 +롕 +롖 +롗 +롘 +롙 +롚 +롛 +로 +록 +롞 +롟 +론 +롡 +롢 +롣 +롤 +롥 +롦 +롧 +롨 +롩 +롪 +롫 +롬 +롭 +롮 +롯 +롰 +롱 +롲 +롳 +롴 +롵 +롶 +롷 +롸 +롹 +롺 +롻 +롼 +롽 +롾 +롿 +뢀 +뢁 +뢂 +뢃 +뢄 +뢅 +뢆 +뢇 +뢈 +뢉 +뢊 +뢋 +뢌 +뢍 +뢎 +뢏 +뢐 +뢑 +뢒 +뢓 +뢔 +뢕 +뢖 +뢗 +뢘 +뢙 +뢚 +뢛 +뢜 +뢝 +뢞 +뢟 +뢠 +뢡 +뢢 +뢣 +뢤 +뢥 +뢦 +뢧 +뢨 +뢩 +뢪 +뢫 +뢬 +뢭 +뢮 +뢯 +뢰 +뢱 +뢲 +뢳 +뢴 +뢵 +뢶 +뢷 +뢸 +뢹 +뢺 +뢻 +뢼 +뢽 +뢾 +뢿 +룀 +룁 +룂 +룃 +룄 +룅 +룆 +룇 +룈 +룉 +룊 +룋 +료 +룍 +룎 +룏 +룐 +룑 +룒 +룓 +룔 +룕 +룖 +룗 +룘 +룙 +룚 +룛 +룜 +룝 +룞 +룟 +룠 +룡 +룢 +룣 +룤 +룥 +룦 +룧 +루 +룩 +룪 +룫 +룬 +룭 +룮 +룯 +룰 +룱 +룲 +룳 +룴 +룵 +룶 +룷 +룸 +룹 +룺 +룻 +룼 +룽 +룾 +룿 +뤀 +뤁 +뤂 +뤃 +뤄 +뤅 +뤆 +뤇 +뤈 +뤉 +뤊 +뤋 +뤌 +뤍 +뤎 +뤏 +뤐 +뤑 +뤒 +뤓 +뤔 +뤕 +뤖 +뤗 +뤘 +뤙 +뤚 +뤛 +뤜 +뤝 +뤞 +뤟 +뤠 +뤡 +뤢 +뤣 +뤤 +뤥 +뤦 +뤧 +뤨 +뤩 +뤪 +뤫 +뤬 +뤭 +뤮 +뤯 +뤰 +뤱 +뤲 +뤳 +뤴 +뤵 +뤶 +뤷 +뤸 +뤹 +뤺 +뤻 +뤼 +뤽 +뤾 +뤿 +륀 +륁 +륂 +륃 +륄 +륅 +륆 +륇 +륈 +륉 +륊 +륋 +륌 +륍 +륎 +륏 +륐 +륑 +륒 +륓 +륔 +륕 +륖 +륗 +류 +륙 +륚 +륛 +륜 +륝 +륞 +륟 +률 +륡 +륢 +륣 +륤 +륥 +륦 +륧 +륨 +륩 +륪 +륫 +륬 +륭 +륮 +륯 +륰 +륱 +륲 +륳 +르 +륵 +륶 +륷 +른 +륹 +륺 +륻 +를 +륽 +륾 +륿 +릀 +릁 +릂 +릃 +름 +릅 +릆 +릇 +릈 +릉 +릊 +릋 +릌 +릍 +릎 +릏 +릐 +릑 +릒 +릓 +릔 +릕 +릖 +릗 
+릘 +릙 +릚 +릛 +릜 +릝 +릞 +릟 +릠 +릡 +릢 +릣 +릤 +릥 +릦 +릧 +릨 +릩 +릪 +릫 +리 +릭 +릮 +릯 +린 +릱 +릲 +릳 +릴 +릵 +릶 +릷 +릸 +릹 +릺 +릻 +림 +립 +릾 +릿 +맀 +링 +맂 +맃 +맄 +맅 +맆 +맇 +마 +막 +맊 +맋 +만 +맍 +많 +맏 +말 +맑 +맒 +맓 +맔 +맕 +맖 +맗 +맘 +맙 +맚 +맛 +맜 +망 +맞 +맟 +맠 +맡 +맢 +맣 +매 +맥 +맦 +맧 +맨 +맩 +맪 +맫 +맬 +맭 +맮 +맯 +맰 +맱 +맲 +맳 +맴 +맵 +맶 +맷 +맸 +맹 +맺 +맻 +맼 +맽 +맾 +맿 +먀 +먁 +먂 +먃 +먄 +먅 +먆 +먇 +먈 +먉 +먊 +먋 +먌 +먍 +먎 +먏 +먐 +먑 +먒 +먓 +먔 +먕 +먖 +먗 +먘 +먙 +먚 +먛 +먜 +먝 +먞 +먟 +먠 +먡 +먢 +먣 +먤 +먥 +먦 +먧 +먨 +먩 +먪 +먫 +먬 +먭 +먮 +먯 +먰 +먱 +먲 +먳 +먴 +먵 +먶 +먷 +머 +먹 +먺 +먻 +먼 +먽 +먾 +먿 +멀 +멁 +멂 +멃 +멄 +멅 +멆 +멇 +멈 +멉 +멊 +멋 +멌 +멍 +멎 +멏 +멐 +멑 +멒 +멓 +메 +멕 +멖 +멗 +멘 +멙 +멚 +멛 +멜 +멝 +멞 +멟 +멠 +멡 +멢 +멣 +멤 +멥 +멦 +멧 +멨 +멩 +멪 +멫 +멬 +멭 +멮 +멯 +며 +멱 +멲 +멳 +면 +멵 +멶 +멷 +멸 +멹 +멺 +멻 +멼 +멽 +멾 +멿 +몀 +몁 +몂 +몃 +몄 +명 +몆 +몇 +몈 +몉 +몊 +몋 +몌 +몍 +몎 +몏 +몐 +몑 +몒 +몓 +몔 +몕 +몖 +몗 +몘 +몙 +몚 +몛 +몜 +몝 +몞 +몟 +몠 +몡 +몢 +몣 +몤 +몥 +몦 +몧 +모 +목 +몪 +몫 +몬 +몭 +몮 +몯 +몰 +몱 +몲 +몳 +몴 +몵 +몶 +몷 +몸 +몹 +몺 +못 +몼 +몽 +몾 +몿 +뫀 +뫁 +뫂 +뫃 +뫄 +뫅 +뫆 +뫇 +뫈 +뫉 +뫊 +뫋 +뫌 +뫍 +뫎 +뫏 +뫐 +뫑 +뫒 +뫓 +뫔 +뫕 +뫖 +뫗 +뫘 +뫙 +뫚 +뫛 +뫜 +뫝 +뫞 +뫟 +뫠 +뫡 +뫢 +뫣 +뫤 +뫥 +뫦 +뫧 +뫨 +뫩 +뫪 +뫫 +뫬 +뫭 +뫮 +뫯 +뫰 +뫱 +뫲 +뫳 +뫴 +뫵 +뫶 +뫷 +뫸 +뫹 +뫺 +뫻 +뫼 +뫽 +뫾 +뫿 +묀 +묁 +묂 +묃 +묄 +묅 +묆 +묇 +묈 +묉 +묊 +묋 +묌 +묍 +묎 +묏 +묐 +묑 +묒 +묓 +묔 +묕 +묖 +묗 +묘 +묙 +묚 +묛 +묜 +묝 +묞 +묟 +묠 +묡 +묢 +묣 +묤 +묥 +묦 +묧 +묨 +묩 +묪 +묫 +묬 +묭 +묮 +묯 +묰 +묱 +묲 +묳 +무 +묵 +묶 +묷 +문 +묹 +묺 +묻 +물 +묽 +묾 +묿 +뭀 +뭁 +뭂 +뭃 +뭄 +뭅 +뭆 +뭇 +뭈 +뭉 +뭊 +뭋 +뭌 +뭍 +뭎 +뭏 +뭐 +뭑 +뭒 +뭓 +뭔 +뭕 +뭖 +뭗 +뭘 +뭙 +뭚 +뭛 +뭜 +뭝 +뭞 +뭟 +뭠 +뭡 +뭢 +뭣 +뭤 +뭥 +뭦 +뭧 +뭨 +뭩 +뭪 +뭫 +뭬 +뭭 +뭮 +뭯 +뭰 +뭱 +뭲 +뭳 +뭴 +뭵 +뭶 +뭷 +뭸 +뭹 +뭺 +뭻 +뭼 +뭽 +뭾 +뭿 +뮀 +뮁 +뮂 +뮃 +뮄 +뮅 +뮆 +뮇 +뮈 +뮉 +뮊 +뮋 +뮌 +뮍 +뮎 +뮏 +뮐 +뮑 +뮒 +뮓 +뮔 +뮕 +뮖 +뮗 +뮘 +뮙 +뮚 +뮛 +뮜 +뮝 +뮞 +뮟 +뮠 +뮡 +뮢 +뮣 +뮤 +뮥 +뮦 +뮧 +뮨 +뮩 +뮪 +뮫 +뮬 +뮭 +뮮 +뮯 +뮰 +뮱 +뮲 +뮳 +뮴 +뮵 +뮶 +뮷 +뮸 +뮹 +뮺 +뮻 +뮼 +뮽 +뮾 +뮿 +므 +믁 +믂 +믃 +믄 +믅 +믆 +믇 +믈 +믉 +믊 +믋 +믌 +믍 +믎 +믏 +믐 +믑 +믒 +믓 +믔 +믕 +믖 +믗 +믘 +믙 +믚 +믛 +믜 +믝 +믞 +믟 +믠 +믡 +믢 +믣 +믤 +믥 +믦 +믧 +믨 +믩 +믪 +믫 +믬 +믭 +믮 +믯 +믰 +믱 +믲 +믳 +믴 +믵 +믶 +믷 +미 +믹 +믺 +믻 +민 +믽 +믾 +믿 +밀 +밁 +밂 +밃 +밄 +밅 +밆 +밇 +밈 +밉 +밊 +밋 +밌 +밍 +밎 +및 +밐 +밑 +밒 +밓 +바 +박 +밖 +밗 +반 +밙 +밚 +받 +발 +밝 +밞 +밟 +밠 +밡 +밢 +밣 +밤 +밥 +밦 +밧 +밨 +방 +밪 +밫 +밬 +밭 +밮 +밯 +배 +백 
+밲 +밳 +밴 +밵 +밶 +밷 +밸 +밹 +밺 +밻 +밼 +밽 +밾 +밿 +뱀 +뱁 +뱂 +뱃 +뱄 +뱅 +뱆 +뱇 +뱈 +뱉 +뱊 +뱋 +뱌 +뱍 +뱎 +뱏 +뱐 +뱑 +뱒 +뱓 +뱔 +뱕 +뱖 +뱗 +뱘 +뱙 +뱚 +뱛 +뱜 +뱝 +뱞 +뱟 +뱠 +뱡 +뱢 +뱣 +뱤 +뱥 +뱦 +뱧 +뱨 +뱩 +뱪 +뱫 +뱬 +뱭 +뱮 +뱯 +뱰 +뱱 +뱲 +뱳 +뱴 +뱵 +뱶 +뱷 +뱸 +뱹 +뱺 +뱻 +뱼 +뱽 +뱾 +뱿 +벀 +벁 +벂 +벃 +버 +벅 +벆 +벇 +번 +벉 +벊 +벋 +벌 +벍 +벎 +벏 +벐 +벑 +벒 +벓 +범 +법 +벖 +벗 +벘 +벙 +벚 +벛 +벜 +벝 +벞 +벟 +베 +벡 +벢 +벣 +벤 +벥 +벦 +벧 +벨 +벩 +벪 +벫 +벬 +벭 +벮 +벯 +벰 +벱 +벲 +벳 +벴 +벵 +벶 +벷 +벸 +벹 +벺 +벻 +벼 +벽 +벾 +벿 +변 +볁 +볂 +볃 +별 +볅 +볆 +볇 +볈 +볉 +볊 +볋 +볌 +볍 +볎 +볏 +볐 +병 +볒 +볓 +볔 +볕 +볖 +볗 +볘 +볙 +볚 +볛 +볜 +볝 +볞 +볟 +볠 +볡 +볢 +볣 +볤 +볥 +볦 +볧 +볨 +볩 +볪 +볫 +볬 +볭 +볮 +볯 +볰 +볱 +볲 +볳 +보 +복 +볶 +볷 +본 +볹 +볺 +볻 +볼 +볽 +볾 +볿 +봀 +봁 +봂 +봃 +봄 +봅 +봆 +봇 +봈 +봉 +봊 +봋 +봌 +봍 +봎 +봏 +봐 +봑 +봒 +봓 +봔 +봕 +봖 +봗 +봘 +봙 +봚 +봛 +봜 +봝 +봞 +봟 +봠 +봡 +봢 +봣 +봤 +봥 +봦 +봧 +봨 +봩 +봪 +봫 +봬 +봭 +봮 +봯 +봰 +봱 +봲 +봳 +봴 +봵 +봶 +봷 +봸 +봹 +봺 +봻 +봼 +봽 +봾 +봿 +뵀 +뵁 +뵂 +뵃 +뵄 +뵅 +뵆 +뵇 +뵈 +뵉 +뵊 +뵋 +뵌 +뵍 +뵎 +뵏 +뵐 +뵑 +뵒 +뵓 +뵔 +뵕 +뵖 +뵗 +뵘 +뵙 +뵚 +뵛 +뵜 +뵝 +뵞 +뵟 +뵠 +뵡 +뵢 +뵣 +뵤 +뵥 +뵦 +뵧 +뵨 +뵩 +뵪 +뵫 +뵬 +뵭 +뵮 +뵯 +뵰 +뵱 +뵲 +뵳 +뵴 +뵵 +뵶 +뵷 +뵸 +뵹 +뵺 +뵻 +뵼 +뵽 +뵾 +뵿 +부 +북 +붂 +붃 +분 +붅 +붆 +붇 +불 +붉 +붊 +붋 +붌 +붍 +붎 +붏 +붐 +붑 +붒 +붓 +붔 +붕 +붖 +붗 +붘 +붙 +붚 +붛 +붜 +붝 +붞 +붟 +붠 +붡 +붢 +붣 +붤 +붥 +붦 +붧 +붨 +붩 +붪 +붫 +붬 +붭 +붮 +붯 +붰 +붱 +붲 +붳 +붴 +붵 +붶 +붷 +붸 +붹 +붺 +붻 +붼 +붽 +붾 +붿 +뷀 +뷁 +뷂 +뷃 +뷄 +뷅 +뷆 +뷇 +뷈 +뷉 +뷊 +뷋 +뷌 +뷍 +뷎 +뷏 +뷐 +뷑 +뷒 +뷓 +뷔 +뷕 +뷖 +뷗 +뷘 +뷙 +뷚 +뷛 +뷜 +뷝 +뷞 +뷟 +뷠 +뷡 +뷢 +뷣 +뷤 +뷥 +뷦 +뷧 +뷨 +뷩 +뷪 +뷫 +뷬 +뷭 +뷮 +뷯 +뷰 +뷱 +뷲 +뷳 +뷴 +뷵 +뷶 +뷷 +뷸 +뷹 +뷺 +뷻 +뷼 +뷽 +뷾 +뷿 +븀 +븁 +븂 +븃 +븄 +븅 +븆 +븇 +븈 +븉 +븊 +븋 +브 +븍 +븎 +븏 +븐 +븑 +븒 +븓 +블 +븕 +븖 +븗 +븘 +븙 +븚 +븛 +븜 +븝 +븞 +븟 +븠 +븡 +븢 +븣 +븤 +븥 +븦 +븧 +븨 +븩 +븪 +븫 +븬 +븭 +븮 +븯 +븰 +븱 +븲 +븳 +븴 +븵 +븶 +븷 +븸 +븹 +븺 +븻 +븼 +븽 +븾 +븿 +빀 +빁 +빂 +빃 +비 +빅 +빆 +빇 +빈 +빉 +빊 +빋 +빌 +빍 +빎 +빏 +빐 +빑 +빒 +빓 +빔 +빕 +빖 +빗 +빘 +빙 +빚 +빛 +빜 +빝 +빞 +빟 +빠 +빡 +빢 +빣 +빤 +빥 +빦 +빧 +빨 +빩 +빪 +빫 +빬 +빭 +빮 +빯 +빰 +빱 +빲 +빳 +빴 +빵 +빶 +빷 +빸 +빹 +빺 +빻 +빼 +빽 +빾 +빿 +뺀 +뺁 +뺂 +뺃 +뺄 +뺅 +뺆 +뺇 +뺈 +뺉 +뺊 +뺋 +뺌 +뺍 +뺎 +뺏 +뺐 +뺑 +뺒 +뺓 +뺔 +뺕 +뺖 +뺗 +뺘 +뺙 +뺚 +뺛 +뺜 +뺝 +뺞 +뺟 +뺠 +뺡 +뺢 +뺣 +뺤 +뺥 +뺦 +뺧 +뺨 +뺩 +뺪 +뺫 +뺬 +뺭 +뺮 +뺯 +뺰 +뺱 +뺲 +뺳 +뺴 +뺵 +뺶 +뺷 +뺸 +뺹 +뺺 +뺻 +뺼 +뺽 +뺾 +뺿 +뻀 +뻁 +뻂 +뻃 +뻄 +뻅 +뻆 +뻇 +뻈 +뻉 +뻊 +뻋 
+뻌 +뻍 +뻎 +뻏 +뻐 +뻑 +뻒 +뻓 +뻔 +뻕 +뻖 +뻗 +뻘 +뻙 +뻚 +뻛 +뻜 +뻝 +뻞 +뻟 +뻠 +뻡 +뻢 +뻣 +뻤 +뻥 +뻦 +뻧 +뻨 +뻩 +뻪 +뻫 +뻬 +뻭 +뻮 +뻯 +뻰 +뻱 +뻲 +뻳 +뻴 +뻵 +뻶 +뻷 +뻸 +뻹 +뻺 +뻻 +뻼 +뻽 +뻾 +뻿 +뼀 +뼁 +뼂 +뼃 +뼄 +뼅 +뼆 +뼇 +뼈 +뼉 +뼊 +뼋 +뼌 +뼍 +뼎 +뼏 +뼐 +뼑 +뼒 +뼓 +뼔 +뼕 +뼖 +뼗 +뼘 +뼙 +뼚 +뼛 +뼜 +뼝 +뼞 +뼟 +뼠 +뼡 +뼢 +뼣 +뼤 +뼥 +뼦 +뼧 +뼨 +뼩 +뼪 +뼫 +뼬 +뼭 +뼮 +뼯 +뼰 +뼱 +뼲 +뼳 +뼴 +뼵 +뼶 +뼷 +뼸 +뼹 +뼺 +뼻 +뼼 +뼽 +뼾 +뼿 +뽀 +뽁 +뽂 +뽃 +뽄 +뽅 +뽆 +뽇 +뽈 +뽉 +뽊 +뽋 +뽌 +뽍 +뽎 +뽏 +뽐 +뽑 +뽒 +뽓 +뽔 +뽕 +뽖 +뽗 +뽘 +뽙 +뽚 +뽛 +뽜 +뽝 +뽞 +뽟 +뽠 +뽡 +뽢 +뽣 +뽤 +뽥 +뽦 +뽧 +뽨 +뽩 +뽪 +뽫 +뽬 +뽭 +뽮 +뽯 +뽰 +뽱 +뽲 +뽳 +뽴 +뽵 +뽶 +뽷 +뽸 +뽹 +뽺 +뽻 +뽼 +뽽 +뽾 +뽿 +뾀 +뾁 +뾂 +뾃 +뾄 +뾅 +뾆 +뾇 +뾈 +뾉 +뾊 +뾋 +뾌 +뾍 +뾎 +뾏 +뾐 +뾑 +뾒 +뾓 +뾔 +뾕 +뾖 +뾗 +뾘 +뾙 +뾚 +뾛 +뾜 +뾝 +뾞 +뾟 +뾠 +뾡 +뾢 +뾣 +뾤 +뾥 +뾦 +뾧 +뾨 +뾩 +뾪 +뾫 +뾬 +뾭 +뾮 +뾯 +뾰 +뾱 +뾲 +뾳 +뾴 +뾵 +뾶 +뾷 +뾸 +뾹 +뾺 +뾻 +뾼 +뾽 +뾾 +뾿 +뿀 +뿁 +뿂 +뿃 +뿄 +뿅 +뿆 +뿇 +뿈 +뿉 +뿊 +뿋 +뿌 +뿍 +뿎 +뿏 +뿐 +뿑 +뿒 +뿓 +뿔 +뿕 +뿖 +뿗 +뿘 +뿙 +뿚 +뿛 +뿜 +뿝 +뿞 +뿟 +뿠 +뿡 +뿢 +뿣 +뿤 +뿥 +뿦 +뿧 +뿨 +뿩 +뿪 +뿫 +뿬 +뿭 +뿮 +뿯 +뿰 +뿱 +뿲 +뿳 +뿴 +뿵 +뿶 +뿷 +뿸 +뿹 +뿺 +뿻 +뿼 +뿽 +뿾 +뿿 +쀀 +쀁 +쀂 +쀃 +쀄 +쀅 +쀆 +쀇 +쀈 +쀉 +쀊 +쀋 +쀌 +쀍 +쀎 +쀏 +쀐 +쀑 +쀒 +쀓 +쀔 +쀕 +쀖 +쀗 +쀘 +쀙 +쀚 +쀛 +쀜 +쀝 +쀞 +쀟 +쀠 +쀡 +쀢 +쀣 +쀤 +쀥 +쀦 +쀧 +쀨 +쀩 +쀪 +쀫 +쀬 +쀭 +쀮 +쀯 +쀰 +쀱 +쀲 +쀳 +쀴 +쀵 +쀶 +쀷 +쀸 +쀹 +쀺 +쀻 +쀼 +쀽 +쀾 +쀿 +쁀 +쁁 +쁂 +쁃 +쁄 +쁅 +쁆 +쁇 +쁈 +쁉 +쁊 +쁋 +쁌 +쁍 +쁎 +쁏 +쁐 +쁑 +쁒 +쁓 +쁔 +쁕 +쁖 +쁗 +쁘 +쁙 +쁚 +쁛 +쁜 +쁝 +쁞 +쁟 +쁠 +쁡 +쁢 +쁣 +쁤 +쁥 +쁦 +쁧 +쁨 +쁩 +쁪 +쁫 +쁬 +쁭 +쁮 +쁯 +쁰 +쁱 +쁲 +쁳 +쁴 +쁵 +쁶 +쁷 +쁸 +쁹 +쁺 +쁻 +쁼 +쁽 +쁾 +쁿 +삀 +삁 +삂 +삃 +삄 +삅 +삆 +삇 +삈 +삉 +삊 +삋 +삌 +삍 +삎 +삏 +삐 +삑 +삒 +삓 +삔 +삕 +삖 +삗 +삘 +삙 +삚 +삛 +삜 +삝 +삞 +삟 +삠 +삡 +삢 +삣 +삤 +삥 +삦 +삧 +삨 +삩 +삪 +삫 +사 +삭 +삮 +삯 +산 +삱 +삲 +삳 +살 +삵 +삶 +삷 +삸 +삹 +삺 +삻 +삼 +삽 +삾 +삿 +샀 +상 +샂 +샃 +샄 +샅 +샆 +샇 +새 +색 +샊 +샋 +샌 +샍 +샎 +샏 +샐 +샑 +샒 +샓 +샔 +샕 +샖 +샗 +샘 +샙 +샚 +샛 +샜 +생 +샞 +샟 +샠 +샡 +샢 +샣 +샤 +샥 +샦 +샧 +샨 +샩 +샪 +샫 +샬 +샭 +샮 +샯 +샰 +샱 +샲 +샳 +샴 +샵 +샶 +샷 +샸 +샹 +샺 +샻 +샼 +샽 +샾 +샿 +섀 +섁 +섂 +섃 +섄 +섅 +섆 +섇 +섈 +섉 +섊 +섋 +섌 +섍 +섎 +섏 +섐 +섑 +섒 +섓 +섔 +섕 +섖 +섗 +섘 +섙 +섚 +섛 +서 +석 +섞 +섟 +선 +섡 +섢 +섣 +설 +섥 +섦 +섧 +섨 +섩 +섪 +섫 +섬 +섭 +섮 +섯 +섰 +성 +섲 +섳 +섴 +섵 +섶 +섷 +세 +섹 +섺 +섻 +센 +섽 +섾 +섿 +셀 +셁 +셂 +셃 +셄 +셅 +셆 +셇 +셈 +셉 +셊 +셋 +셌 +셍 +셎 +셏 +셐 +셑 +셒 +셓 +셔 +셕 +셖 +셗 +션 +셙 +셚 +셛 +셜 +셝 +셞 +셟 +셠 +셡 +셢 +셣 +셤 +셥 
+셦 +셧 +셨 +셩 +셪 +셫 +셬 +셭 +셮 +셯 +셰 +셱 +셲 +셳 +셴 +셵 +셶 +셷 +셸 +셹 +셺 +셻 +셼 +셽 +셾 +셿 +솀 +솁 +솂 +솃 +솄 +솅 +솆 +솇 +솈 +솉 +솊 +솋 +소 +속 +솎 +솏 +손 +솑 +솒 +솓 +솔 +솕 +솖 +솗 +솘 +솙 +솚 +솛 +솜 +솝 +솞 +솟 +솠 +송 +솢 +솣 +솤 +솥 +솦 +솧 +솨 +솩 +솪 +솫 +솬 +솭 +솮 +솯 +솰 +솱 +솲 +솳 +솴 +솵 +솶 +솷 +솸 +솹 +솺 +솻 +솼 +솽 +솾 +솿 +쇀 +쇁 +쇂 +쇃 +쇄 +쇅 +쇆 +쇇 +쇈 +쇉 +쇊 +쇋 +쇌 +쇍 +쇎 +쇏 +쇐 +쇑 +쇒 +쇓 +쇔 +쇕 +쇖 +쇗 +쇘 +쇙 +쇚 +쇛 +쇜 +쇝 +쇞 +쇟 +쇠 +쇡 +쇢 +쇣 +쇤 +쇥 +쇦 +쇧 +쇨 +쇩 +쇪 +쇫 +쇬 +쇭 +쇮 +쇯 +쇰 +쇱 +쇲 +쇳 +쇴 +쇵 +쇶 +쇷 +쇸 +쇹 +쇺 +쇻 +쇼 +쇽 +쇾 +쇿 +숀 +숁 +숂 +숃 +숄 +숅 +숆 +숇 +숈 +숉 +숊 +숋 +숌 +숍 +숎 +숏 +숐 +숑 +숒 +숓 +숔 +숕 +숖 +숗 +수 +숙 +숚 +숛 +순 +숝 +숞 +숟 +술 +숡 +숢 +숣 +숤 +숥 +숦 +숧 +숨 +숩 +숪 +숫 +숬 +숭 +숮 +숯 +숰 +숱 +숲 +숳 +숴 +숵 +숶 +숷 +숸 +숹 +숺 +숻 +숼 +숽 +숾 +숿 +쉀 +쉁 +쉂 +쉃 +쉄 +쉅 +쉆 +쉇 +쉈 +쉉 +쉊 +쉋 +쉌 +쉍 +쉎 +쉏 +쉐 +쉑 +쉒 +쉓 +쉔 +쉕 +쉖 +쉗 +쉘 +쉙 +쉚 +쉛 +쉜 +쉝 +쉞 +쉟 +쉠 +쉡 +쉢 +쉣 +쉤 +쉥 +쉦 +쉧 +쉨 +쉩 +쉪 +쉫 +쉬 +쉭 +쉮 +쉯 +쉰 +쉱 +쉲 +쉳 +쉴 +쉵 +쉶 +쉷 +쉸 +쉹 +쉺 +쉻 +쉼 +쉽 +쉾 +쉿 +슀 +슁 +슂 +슃 +슄 +슅 +슆 +슇 +슈 +슉 +슊 +슋 +슌 +슍 +슎 +슏 +슐 +슑 +슒 +슓 +슔 +슕 +슖 +슗 +슘 +슙 +슚 +슛 +슜 +슝 +슞 +슟 +슠 +슡 +슢 +슣 +스 +슥 +슦 +슧 +슨 +슩 +슪 +슫 +슬 +슭 +슮 +슯 +슰 +슱 +슲 +슳 +슴 +습 +슶 +슷 +슸 +승 +슺 +슻 +슼 +슽 +슾 +슿 +싀 +싁 +싂 +싃 +싄 +싅 +싆 +싇 +싈 +싉 +싊 +싋 +싌 +싍 +싎 +싏 +싐 +싑 +싒 +싓 +싔 +싕 +싖 +싗 +싘 +싙 +싚 +싛 +시 +식 +싞 +싟 +신 +싡 +싢 +싣 +실 +싥 +싦 +싧 +싨 +싩 +싪 +싫 +심 +십 +싮 +싯 +싰 +싱 +싲 +싳 +싴 +싵 +싶 +싷 +싸 +싹 +싺 +싻 +싼 +싽 +싾 +싿 +쌀 +쌁 +쌂 +쌃 +쌄 +쌅 +쌆 +쌇 +쌈 +쌉 +쌊 +쌋 +쌌 +쌍 +쌎 +쌏 +쌐 +쌑 +쌒 +쌓 +쌔 +쌕 +쌖 +쌗 +쌘 +쌙 +쌚 +쌛 +쌜 +쌝 +쌞 +쌟 +쌠 +쌡 +쌢 +쌣 +쌤 +쌥 +쌦 +쌧 +쌨 +쌩 +쌪 +쌫 +쌬 +쌭 +쌮 +쌯 +쌰 +쌱 +쌲 +쌳 +쌴 +쌵 +쌶 +쌷 +쌸 +쌹 +쌺 +쌻 +쌼 +쌽 +쌾 +쌿 +썀 +썁 +썂 +썃 +썄 +썅 +썆 +썇 +썈 +썉 +썊 +썋 +썌 +썍 +썎 +썏 +썐 +썑 +썒 +썓 +썔 +썕 +썖 +썗 +썘 +썙 +썚 +썛 +썜 +썝 +썞 +썟 +썠 +썡 +썢 +썣 +썤 +썥 +썦 +썧 +써 +썩 +썪 +썫 +썬 +썭 +썮 +썯 +썰 +썱 +썲 +썳 +썴 +썵 +썶 +썷 +썸 +썹 +썺 +썻 +썼 +썽 +썾 +썿 +쎀 +쎁 +쎂 +쎃 +쎄 +쎅 +쎆 +쎇 +쎈 +쎉 +쎊 +쎋 +쎌 +쎍 +쎎 +쎏 +쎐 +쎑 +쎒 +쎓 +쎔 +쎕 +쎖 +쎗 +쎘 +쎙 +쎚 +쎛 +쎜 +쎝 +쎞 +쎟 +쎠 +쎡 +쎢 +쎣 +쎤 +쎥 +쎦 +쎧 +쎨 +쎩 +쎪 +쎫 +쎬 +쎭 +쎮 +쎯 +쎰 +쎱 +쎲 +쎳 +쎴 +쎵 +쎶 +쎷 +쎸 +쎹 +쎺 +쎻 +쎼 +쎽 +쎾 +쎿 +쏀 +쏁 +쏂 +쏃 +쏄 +쏅 +쏆 +쏇 +쏈 +쏉 +쏊 +쏋 +쏌 +쏍 +쏎 +쏏 +쏐 +쏑 +쏒 +쏓 +쏔 +쏕 +쏖 +쏗 +쏘 +쏙 +쏚 +쏛 +쏜 +쏝 +쏞 +쏟 +쏠 +쏡 +쏢 +쏣 +쏤 +쏥 +쏦 +쏧 +쏨 +쏩 +쏪 +쏫 +쏬 +쏭 +쏮 +쏯 +쏰 +쏱 +쏲 +쏳 +쏴 +쏵 +쏶 +쏷 +쏸 +쏹 +쏺 +쏻 +쏼 +쏽 +쏾 +쏿 
+쐀 +쐁 +쐂 +쐃 +쐄 +쐅 +쐆 +쐇 +쐈 +쐉 +쐊 +쐋 +쐌 +쐍 +쐎 +쐏 +쐐 +쐑 +쐒 +쐓 +쐔 +쐕 +쐖 +쐗 +쐘 +쐙 +쐚 +쐛 +쐜 +쐝 +쐞 +쐟 +쐠 +쐡 +쐢 +쐣 +쐤 +쐥 +쐦 +쐧 +쐨 +쐩 +쐪 +쐫 +쐬 +쐭 +쐮 +쐯 +쐰 +쐱 +쐲 +쐳 +쐴 +쐵 +쐶 +쐷 +쐸 +쐹 +쐺 +쐻 +쐼 +쐽 +쐾 +쐿 +쑀 +쑁 +쑂 +쑃 +쑄 +쑅 +쑆 +쑇 +쑈 +쑉 +쑊 +쑋 +쑌 +쑍 +쑎 +쑏 +쑐 +쑑 +쑒 +쑓 +쑔 +쑕 +쑖 +쑗 +쑘 +쑙 +쑚 +쑛 +쑜 +쑝 +쑞 +쑟 +쑠 +쑡 +쑢 +쑣 +쑤 +쑥 +쑦 +쑧 +쑨 +쑩 +쑪 +쑫 +쑬 +쑭 +쑮 +쑯 +쑰 +쑱 +쑲 +쑳 +쑴 +쑵 +쑶 +쑷 +쑸 +쑹 +쑺 +쑻 +쑼 +쑽 +쑾 +쑿 +쒀 +쒁 +쒂 +쒃 +쒄 +쒅 +쒆 +쒇 +쒈 +쒉 +쒊 +쒋 +쒌 +쒍 +쒎 +쒏 +쒐 +쒑 +쒒 +쒓 +쒔 +쒕 +쒖 +쒗 +쒘 +쒙 +쒚 +쒛 +쒜 +쒝 +쒞 +쒟 +쒠 +쒡 +쒢 +쒣 +쒤 +쒥 +쒦 +쒧 +쒨 +쒩 +쒪 +쒫 +쒬 +쒭 +쒮 +쒯 +쒰 +쒱 +쒲 +쒳 +쒴 +쒵 +쒶 +쒷 +쒸 +쒹 +쒺 +쒻 +쒼 +쒽 +쒾 +쒿 +쓀 +쓁 +쓂 +쓃 +쓄 +쓅 +쓆 +쓇 +쓈 +쓉 +쓊 +쓋 +쓌 +쓍 +쓎 +쓏 +쓐 +쓑 +쓒 +쓓 +쓔 +쓕 +쓖 +쓗 +쓘 +쓙 +쓚 +쓛 +쓜 +쓝 +쓞 +쓟 +쓠 +쓡 +쓢 +쓣 +쓤 +쓥 +쓦 +쓧 +쓨 +쓩 +쓪 +쓫 +쓬 +쓭 +쓮 +쓯 +쓰 +쓱 +쓲 +쓳 +쓴 +쓵 +쓶 +쓷 +쓸 +쓹 +쓺 +쓻 +쓼 +쓽 +쓾 +쓿 +씀 +씁 +씂 +씃 +씄 +씅 +씆 +씇 +씈 +씉 +씊 +씋 +씌 +씍 +씎 +씏 +씐 +씑 +씒 +씓 +씔 +씕 +씖 +씗 +씘 +씙 +씚 +씛 +씜 +씝 +씞 +씟 +씠 +씡 +씢 +씣 +씤 +씥 +씦 +씧 +씨 +씩 +씪 +씫 +씬 +씭 +씮 +씯 +씰 +씱 +씲 +씳 +씴 +씵 +씶 +씷 +씸 +씹 +씺 +씻 +씼 +씽 +씾 +씿 +앀 +앁 +앂 +앃 +아 +악 +앆 +앇 +안 +앉 +않 +앋 +알 +앍 +앎 +앏 +앐 +앑 +앒 +앓 +암 +압 +앖 +앗 +았 +앙 +앚 +앛 +앜 +앝 +앞 +앟 +애 +액 +앢 +앣 +앤 +앥 +앦 +앧 +앨 +앩 +앪 +앫 +앬 +앭 +앮 +앯 +앰 +앱 +앲 +앳 +앴 +앵 +앶 +앷 +앸 +앹 +앺 +앻 +야 +약 +앾 +앿 +얀 +얁 +얂 +얃 +얄 +얅 +얆 +얇 +얈 +얉 +얊 +얋 +얌 +얍 +얎 +얏 +얐 +양 +얒 +얓 +얔 +얕 +얖 +얗 +얘 +얙 +얚 +얛 +얜 +얝 +얞 +얟 +얠 +얡 +얢 +얣 +얤 +얥 +얦 +얧 +얨 +얩 +얪 +얫 +얬 +얭 +얮 +얯 +얰 +얱 +얲 +얳 +어 +억 +얶 +얷 +언 +얹 +얺 +얻 +얼 +얽 +얾 +얿 +엀 +엁 +엂 +엃 +엄 +업 +없 +엇 +었 +엉 +엊 +엋 +엌 +엍 +엎 +엏 +에 +엑 +엒 +엓 +엔 +엕 +엖 +엗 +엘 +엙 +엚 +엛 +엜 +엝 +엞 +엟 +엠 +엡 +엢 +엣 +엤 +엥 +엦 +엧 +엨 +엩 +엪 +엫 +여 +역 +엮 +엯 +연 +엱 +엲 +엳 +열 +엵 +엶 +엷 +엸 +엹 +엺 +엻 +염 +엽 +엾 +엿 +였 +영 +옂 +옃 +옄 +옅 +옆 +옇 +예 +옉 +옊 +옋 +옌 +옍 +옎 +옏 +옐 +옑 +옒 +옓 +옔 +옕 +옖 +옗 +옘 +옙 +옚 +옛 +옜 +옝 +옞 +옟 +옠 +옡 +옢 +옣 +오 +옥 +옦 +옧 +온 +옩 +옪 +옫 +올 +옭 +옮 +옯 +옰 +옱 +옲 +옳 +옴 +옵 +옶 +옷 +옸 +옹 +옺 +옻 +옼 +옽 +옾 +옿 +와 +왁 +왂 +왃 +완 +왅 +왆 +왇 +왈 +왉 +왊 +왋 +왌 +왍 +왎 +왏 +왐 +왑 +왒 +왓 +왔 +왕 +왖 +왗 +왘 +왙 +왚 +왛 +왜 +왝 +왞 +왟 +왠 +왡 +왢 +왣 +왤 +왥 +왦 +왧 +왨 +왩 +왪 +왫 +왬 +왭 +왮 +왯 +왰 +왱 +왲 +왳 +왴 +왵 +왶 +왷 +외 +왹 +왺 +왻 +왼 +왽 +왾 +왿 +욀 +욁 +욂 +욃 +욄 +욅 +욆 +욇 +욈 +욉 +욊 +욋 +욌 +욍 +욎 +욏 +욐 +욑 +욒 +욓 +요 +욕 +욖 +욗 +욘 +욙 
+욚 +욛 +욜 +욝 +욞 +욟 +욠 +욡 +욢 +욣 +욤 +욥 +욦 +욧 +욨 +용 +욪 +욫 +욬 +욭 +욮 +욯 +우 +욱 +욲 +욳 +운 +욵 +욶 +욷 +울 +욹 +욺 +욻 +욼 +욽 +욾 +욿 +움 +웁 +웂 +웃 +웄 +웅 +웆 +웇 +웈 +웉 +웊 +웋 +워 +웍 +웎 +웏 +원 +웑 +웒 +웓 +월 +웕 +웖 +웗 +웘 +웙 +웚 +웛 +웜 +웝 +웞 +웟 +웠 +웡 +웢 +웣 +웤 +웥 +웦 +웧 +웨 +웩 +웪 +웫 +웬 +웭 +웮 +웯 +웰 +웱 +웲 +웳 +웴 +웵 +웶 +웷 +웸 +웹 +웺 +웻 +웼 +웽 +웾 +웿 +윀 +윁 +윂 +윃 +위 +윅 +윆 +윇 +윈 +윉 +윊 +윋 +윌 +윍 +윎 +윏 +윐 +윑 +윒 +윓 +윔 +윕 +윖 +윗 +윘 +윙 +윚 +윛 +윜 +윝 +윞 +윟 +유 +육 +윢 +윣 +윤 +윥 +윦 +윧 +율 +윩 +윪 +윫 +윬 +윭 +윮 +윯 +윰 +윱 +윲 +윳 +윴 +융 +윶 +윷 +윸 +윹 +윺 +윻 +으 +윽 +윾 +윿 +은 +읁 +읂 +읃 +을 +읅 +읆 +읇 +읈 +읉 +읊 +읋 +음 +읍 +읎 +읏 +읐 +응 +읒 +읓 +읔 +읕 +읖 +읗 +의 +읙 +읚 +읛 +읜 +읝 +읞 +읟 +읠 +읡 +읢 +읣 +읤 +읥 +읦 +읧 +읨 +읩 +읪 +읫 +읬 +읭 +읮 +읯 +읰 +읱 +읲 +읳 +이 +익 +읶 +읷 +인 +읹 +읺 +읻 +일 +읽 +읾 +읿 +잀 +잁 +잂 +잃 +임 +입 +잆 +잇 +있 +잉 +잊 +잋 +잌 +잍 +잎 +잏 +자 +작 +잒 +잓 +잔 +잕 +잖 +잗 +잘 +잙 +잚 +잛 +잜 +잝 +잞 +잟 +잠 +잡 +잢 +잣 +잤 +장 +잦 +잧 +잨 +잩 +잪 +잫 +재 +잭 +잮 +잯 +잰 +잱 +잲 +잳 +잴 +잵 +잶 +잷 +잸 +잹 +잺 +잻 +잼 +잽 +잾 +잿 +쟀 +쟁 +쟂 +쟃 +쟄 +쟅 +쟆 +쟇 +쟈 +쟉 +쟊 +쟋 +쟌 +쟍 +쟎 +쟏 +쟐 +쟑 +쟒 +쟓 +쟔 +쟕 +쟖 +쟗 +쟘 +쟙 +쟚 +쟛 +쟜 +쟝 +쟞 +쟟 +쟠 +쟡 +쟢 +쟣 +쟤 +쟥 +쟦 +쟧 +쟨 +쟩 +쟪 +쟫 +쟬 +쟭 +쟮 +쟯 +쟰 +쟱 +쟲 +쟳 +쟴 +쟵 +쟶 +쟷 +쟸 +쟹 +쟺 +쟻 +쟼 +쟽 +쟾 +쟿 +저 +적 +젂 +젃 +전 +젅 +젆 +젇 +절 +젉 +젊 +젋 +젌 +젍 +젎 +젏 +점 +접 +젒 +젓 +젔 +정 +젖 +젗 +젘 +젙 +젚 +젛 +제 +젝 +젞 +젟 +젠 +젡 +젢 +젣 +젤 +젥 +젦 +젧 +젨 +젩 +젪 +젫 +젬 +젭 +젮 +젯 +젰 +젱 +젲 +젳 +젴 +젵 +젶 +젷 +져 +젹 +젺 +젻 +젼 +젽 +젾 +젿 +졀 +졁 +졂 +졃 +졄 +졅 +졆 +졇 +졈 +졉 +졊 +졋 +졌 +졍 +졎 +졏 +졐 +졑 +졒 +졓 +졔 +졕 +졖 +졗 +졘 +졙 +졚 +졛 +졜 +졝 +졞 +졟 +졠 +졡 +졢 +졣 +졤 +졥 +졦 +졧 +졨 +졩 +졪 +졫 +졬 +졭 +졮 +졯 +조 +족 +졲 +졳 +존 +졵 +졶 +졷 +졸 +졹 +졺 +졻 +졼 +졽 +졾 +졿 +좀 +좁 +좂 +좃 +좄 +종 +좆 +좇 +좈 +좉 +좊 +좋 +좌 +좍 +좎 +좏 +좐 +좑 +좒 +좓 +좔 +좕 +좖 +좗 +좘 +좙 +좚 +좛 +좜 +좝 +좞 +좟 +좠 +좡 +좢 +좣 +좤 +좥 +좦 +좧 +좨 +좩 +좪 +좫 +좬 +좭 +좮 +좯 +좰 +좱 +좲 +좳 +좴 +좵 +좶 +좷 +좸 +좹 +좺 +좻 +좼 +좽 +좾 +좿 +죀 +죁 +죂 +죃 +죄 +죅 +죆 +죇 +죈 +죉 +죊 +죋 +죌 +죍 +죎 +죏 +죐 +죑 +죒 +죓 +죔 +죕 +죖 +죗 +죘 +죙 +죚 +죛 +죜 +죝 +죞 +죟 +죠 +죡 +죢 +죣 +죤 +죥 +죦 +죧 +죨 +죩 +죪 +죫 +죬 +죭 +죮 +죯 +죰 +죱 +죲 +죳 +죴 +죵 +죶 +죷 +죸 +죹 +죺 +죻 +주 +죽 +죾 +죿 +준 +줁 +줂 +줃 +줄 +줅 +줆 +줇 +줈 +줉 +줊 +줋 +줌 +줍 +줎 +줏 +줐 +중 +줒 +줓 +줔 +줕 +줖 +줗 +줘 +줙 +줚 +줛 +줜 +줝 +줞 +줟 +줠 +줡 +줢 +줣 +줤 +줥 +줦 +줧 +줨 +줩 +줪 +줫 +줬 +줭 +줮 +줯 +줰 +줱 +줲 +줳 
+줴 +줵 +줶 +줷 +줸 +줹 +줺 +줻 +줼 +줽 +줾 +줿 +쥀 +쥁 +쥂 +쥃 +쥄 +쥅 +쥆 +쥇 +쥈 +쥉 +쥊 +쥋 +쥌 +쥍 +쥎 +쥏 +쥐 +쥑 +쥒 +쥓 +쥔 +쥕 +쥖 +쥗 +쥘 +쥙 +쥚 +쥛 +쥜 +쥝 +쥞 +쥟 +쥠 +쥡 +쥢 +쥣 +쥤 +쥥 +쥦 +쥧 +쥨 +쥩 +쥪 +쥫 +쥬 +쥭 +쥮 +쥯 +쥰 +쥱 +쥲 +쥳 +쥴 +쥵 +쥶 +쥷 +쥸 +쥹 +쥺 +쥻 +쥼 +쥽 +쥾 +쥿 +즀 +즁 +즂 +즃 +즄 +즅 +즆 +즇 +즈 +즉 +즊 +즋 +즌 +즍 +즎 +즏 +즐 +즑 +즒 +즓 +즔 +즕 +즖 +즗 +즘 +즙 +즚 +즛 +즜 +증 +즞 +즟 +즠 +즡 +즢 +즣 +즤 +즥 +즦 +즧 +즨 +즩 +즪 +즫 +즬 +즭 +즮 +즯 +즰 +즱 +즲 +즳 +즴 +즵 +즶 +즷 +즸 +즹 +즺 +즻 +즼 +즽 +즾 +즿 +지 +직 +짂 +짃 +진 +짅 +짆 +짇 +질 +짉 +짊 +짋 +짌 +짍 +짎 +짏 +짐 +집 +짒 +짓 +짔 +징 +짖 +짗 +짘 +짙 +짚 +짛 +짜 +짝 +짞 +짟 +짠 +짡 +짢 +짣 +짤 +짥 +짦 +짧 +짨 +짩 +짪 +짫 +짬 +짭 +짮 +짯 +짰 +짱 +짲 +짳 +짴 +짵 +짶 +짷 +째 +짹 +짺 +짻 +짼 +짽 +짾 +짿 +쨀 +쨁 +쨂 +쨃 +쨄 +쨅 +쨆 +쨇 +쨈 +쨉 +쨊 +쨋 +쨌 +쨍 +쨎 +쨏 +쨐 +쨑 +쨒 +쨓 +쨔 +쨕 +쨖 +쨗 +쨘 +쨙 +쨚 +쨛 +쨜 +쨝 +쨞 +쨟 +쨠 +쨡 +쨢 +쨣 +쨤 +쨥 +쨦 +쨧 +쨨 +쨩 +쨪 +쨫 +쨬 +쨭 +쨮 +쨯 +쨰 +쨱 +쨲 +쨳 +쨴 +쨵 +쨶 +쨷 +쨸 +쨹 +쨺 +쨻 +쨼 +쨽 +쨾 +쨿 +쩀 +쩁 +쩂 +쩃 +쩄 +쩅 +쩆 +쩇 +쩈 +쩉 +쩊 +쩋 +쩌 +쩍 +쩎 +쩏 +쩐 +쩑 +쩒 +쩓 +쩔 +쩕 +쩖 +쩗 +쩘 +쩙 +쩚 +쩛 +쩜 +쩝 +쩞 +쩟 +쩠 +쩡 +쩢 +쩣 +쩤 +쩥 +쩦 +쩧 +쩨 +쩩 +쩪 +쩫 +쩬 +쩭 +쩮 +쩯 +쩰 +쩱 +쩲 +쩳 +쩴 +쩵 +쩶 +쩷 +쩸 +쩹 +쩺 +쩻 +쩼 +쩽 +쩾 +쩿 +쪀 +쪁 +쪂 +쪃 +쪄 +쪅 +쪆 +쪇 +쪈 +쪉 +쪊 +쪋 +쪌 +쪍 +쪎 +쪏 +쪐 +쪑 +쪒 +쪓 +쪔 +쪕 +쪖 +쪗 +쪘 +쪙 +쪚 +쪛 +쪜 +쪝 +쪞 +쪟 +쪠 +쪡 +쪢 +쪣 +쪤 +쪥 +쪦 +쪧 +쪨 +쪩 +쪪 +쪫 +쪬 +쪭 +쪮 +쪯 +쪰 +쪱 +쪲 +쪳 +쪴 +쪵 +쪶 +쪷 +쪸 +쪹 +쪺 +쪻 +쪼 +쪽 +쪾 +쪿 +쫀 +쫁 +쫂 +쫃 +쫄 +쫅 +쫆 +쫇 +쫈 +쫉 +쫊 +쫋 +쫌 +쫍 +쫎 +쫏 +쫐 +쫑 +쫒 +쫓 +쫔 +쫕 +쫖 +쫗 +쫘 +쫙 +쫚 +쫛 +쫜 +쫝 +쫞 +쫟 +쫠 +쫡 +쫢 +쫣 +쫤 +쫥 +쫦 +쫧 +쫨 +쫩 +쫪 +쫫 +쫬 +쫭 +쫮 +쫯 +쫰 +쫱 +쫲 +쫳 +쫴 +쫵 +쫶 +쫷 +쫸 +쫹 +쫺 +쫻 +쫼 +쫽 +쫾 +쫿 +쬀 +쬁 +쬂 +쬃 +쬄 +쬅 +쬆 +쬇 +쬈 +쬉 +쬊 +쬋 +쬌 +쬍 +쬎 +쬏 +쬐 +쬑 +쬒 +쬓 +쬔 +쬕 +쬖 +쬗 +쬘 +쬙 +쬚 +쬛 +쬜 +쬝 +쬞 +쬟 +쬠 +쬡 +쬢 +쬣 +쬤 +쬥 +쬦 +쬧 +쬨 +쬩 +쬪 +쬫 +쬬 +쬭 +쬮 +쬯 +쬰 +쬱 +쬲 +쬳 +쬴 +쬵 +쬶 +쬷 +쬸 +쬹 +쬺 +쬻 +쬼 +쬽 +쬾 +쬿 +쭀 +쭁 +쭂 +쭃 +쭄 +쭅 +쭆 +쭇 +쭈 +쭉 +쭊 +쭋 +쭌 +쭍 +쭎 +쭏 +쭐 +쭑 +쭒 +쭓 +쭔 +쭕 +쭖 +쭗 +쭘 +쭙 +쭚 +쭛 +쭜 +쭝 +쭞 +쭟 +쭠 +쭡 +쭢 +쭣 +쭤 +쭥 +쭦 +쭧 +쭨 +쭩 +쭪 +쭫 +쭬 +쭭 +쭮 +쭯 +쭰 +쭱 +쭲 +쭳 +쭴 +쭵 +쭶 +쭷 +쭸 +쭹 +쭺 +쭻 +쭼 +쭽 +쭾 +쭿 +쮀 +쮁 +쮂 +쮃 +쮄 +쮅 +쮆 +쮇 +쮈 +쮉 +쮊 +쮋 +쮌 +쮍 +쮎 +쮏 +쮐 +쮑 +쮒 +쮓 +쮔 +쮕 +쮖 +쮗 +쮘 +쮙 +쮚 +쮛 +쮜 +쮝 +쮞 +쮟 +쮠 +쮡 +쮢 +쮣 +쮤 +쮥 +쮦 +쮧 +쮨 +쮩 +쮪 +쮫 +쮬 +쮭 +쮮 +쮯 +쮰 +쮱 +쮲 +쮳 +쮴 +쮵 +쮶 +쮷 +쮸 +쮹 +쮺 +쮻 +쮼 +쮽 +쮾 +쮿 +쯀 +쯁 +쯂 +쯃 +쯄 +쯅 +쯆 +쯇 +쯈 +쯉 +쯊 +쯋 +쯌 +쯍 
+쯎 +쯏 +쯐 +쯑 +쯒 +쯓 +쯔 +쯕 +쯖 +쯗 +쯘 +쯙 +쯚 +쯛 +쯜 +쯝 +쯞 +쯟 +쯠 +쯡 +쯢 +쯣 +쯤 +쯥 +쯦 +쯧 +쯨 +쯩 +쯪 +쯫 +쯬 +쯭 +쯮 +쯯 +쯰 +쯱 +쯲 +쯳 +쯴 +쯵 +쯶 +쯷 +쯸 +쯹 +쯺 +쯻 +쯼 +쯽 +쯾 +쯿 +찀 +찁 +찂 +찃 +찄 +찅 +찆 +찇 +찈 +찉 +찊 +찋 +찌 +찍 +찎 +찏 +찐 +찑 +찒 +찓 +찔 +찕 +찖 +찗 +찘 +찙 +찚 +찛 +찜 +찝 +찞 +찟 +찠 +찡 +찢 +찣 +찤 +찥 +찦 +찧 +차 +착 +찪 +찫 +찬 +찭 +찮 +찯 +찰 +찱 +찲 +찳 +찴 +찵 +찶 +찷 +참 +찹 +찺 +찻 +찼 +창 +찾 +찿 +챀 +챁 +챂 +챃 +채 +책 +챆 +챇 +챈 +챉 +챊 +챋 +챌 +챍 +챎 +챏 +챐 +챑 +챒 +챓 +챔 +챕 +챖 +챗 +챘 +챙 +챚 +챛 +챜 +챝 +챞 +챟 +챠 +챡 +챢 +챣 +챤 +챥 +챦 +챧 +챨 +챩 +챪 +챫 +챬 +챭 +챮 +챯 +챰 +챱 +챲 +챳 +챴 +챵 +챶 +챷 +챸 +챹 +챺 +챻 +챼 +챽 +챾 +챿 +첀 +첁 +첂 +첃 +첄 +첅 +첆 +첇 +첈 +첉 +첊 +첋 +첌 +첍 +첎 +첏 +첐 +첑 +첒 +첓 +첔 +첕 +첖 +첗 +처 +척 +첚 +첛 +천 +첝 +첞 +첟 +철 +첡 +첢 +첣 +첤 +첥 +첦 +첧 +첨 +첩 +첪 +첫 +첬 +청 +첮 +첯 +첰 +첱 +첲 +첳 +체 +첵 +첶 +첷 +첸 +첹 +첺 +첻 +첼 +첽 +첾 +첿 +쳀 +쳁 +쳂 +쳃 +쳄 +쳅 +쳆 +쳇 +쳈 +쳉 +쳊 +쳋 +쳌 +쳍 +쳎 +쳏 +쳐 +쳑 +쳒 +쳓 +쳔 +쳕 +쳖 +쳗 +쳘 +쳙 +쳚 +쳛 +쳜 +쳝 +쳞 +쳟 +쳠 +쳡 +쳢 +쳣 +쳤 +쳥 +쳦 +쳧 +쳨 +쳩 +쳪 +쳫 +쳬 +쳭 +쳮 +쳯 +쳰 +쳱 +쳲 +쳳 +쳴 +쳵 +쳶 +쳷 +쳸 +쳹 +쳺 +쳻 +쳼 +쳽 +쳾 +쳿 +촀 +촁 +촂 +촃 +촄 +촅 +촆 +촇 +초 +촉 +촊 +촋 +촌 +촍 +촎 +촏 +촐 +촑 +촒 +촓 +촔 +촕 +촖 +촗 +촘 +촙 +촚 +촛 +촜 +총 +촞 +촟 +촠 +촡 +촢 +촣 +촤 +촥 +촦 +촧 +촨 +촩 +촪 +촫 +촬 +촭 +촮 +촯 +촰 +촱 +촲 +촳 +촴 +촵 +촶 +촷 +촸 +촹 +촺 +촻 +촼 +촽 +촾 +촿 +쵀 +쵁 +쵂 +쵃 +쵄 +쵅 +쵆 +쵇 +쵈 +쵉 +쵊 +쵋 +쵌 +쵍 +쵎 +쵏 +쵐 +쵑 +쵒 +쵓 +쵔 +쵕 +쵖 +쵗 +쵘 +쵙 +쵚 +쵛 +최 +쵝 +쵞 +쵟 +쵠 +쵡 +쵢 +쵣 +쵤 +쵥 +쵦 +쵧 +쵨 +쵩 +쵪 +쵫 +쵬 +쵭 +쵮 +쵯 +쵰 +쵱 +쵲 +쵳 +쵴 +쵵 +쵶 +쵷 +쵸 +쵹 +쵺 +쵻 +쵼 +쵽 +쵾 +쵿 +춀 +춁 +춂 +춃 +춄 +춅 +춆 +춇 +춈 +춉 +춊 +춋 +춌 +춍 +춎 +춏 +춐 +춑 +춒 +춓 +추 +축 +춖 +춗 +춘 +춙 +춚 +춛 +출 +춝 +춞 +춟 +춠 +춡 +춢 +춣 +춤 +춥 +춦 +춧 +춨 +충 +춪 +춫 +춬 +춭 +춮 +춯 +춰 +춱 +춲 +춳 +춴 +춵 +춶 +춷 +춸 +춹 +춺 +춻 +춼 +춽 +춾 +춿 +췀 +췁 +췂 +췃 +췄 +췅 +췆 +췇 +췈 +췉 +췊 +췋 +췌 +췍 +췎 +췏 +췐 +췑 +췒 +췓 +췔 +췕 +췖 +췗 +췘 +췙 +췚 +췛 +췜 +췝 +췞 +췟 +췠 +췡 +췢 +췣 +췤 +췥 +췦 +췧 +취 +췩 +췪 +췫 +췬 +췭 +췮 +췯 +췰 +췱 +췲 +췳 +췴 +췵 +췶 +췷 +췸 +췹 +췺 +췻 +췼 +췽 +췾 +췿 +츀 +츁 +츂 +츃 +츄 +츅 +츆 +츇 +츈 +츉 +츊 +츋 +츌 +츍 +츎 +츏 +츐 +츑 +츒 +츓 +츔 +츕 +츖 +츗 +츘 +츙 +츚 +츛 +츜 +츝 +츞 +츟 +츠 +측 +츢 +츣 +츤 +츥 +츦 +츧 +츨 +츩 +츪 +츫 +츬 +츭 +츮 +츯 +츰 +츱 +츲 +츳 +츴 +층 +츶 +츷 +츸 +츹 +츺 +츻 +츼 +츽 +츾 +츿 +칀 +칁 +칂 +칃 +칄 +칅 +칆 +칇 +칈 +칉 +칊 +칋 +칌 +칍 +칎 +칏 +칐 +칑 +칒 +칓 +칔 +칕 +칖 +칗 +치 +칙 +칚 +칛 +친 +칝 +칞 +칟 +칠 +칡 +칢 +칣 +칤 +칥 +칦 +칧 
+침 +칩 +칪 +칫 +칬 +칭 +칮 +칯 +칰 +칱 +칲 +칳 +카 +칵 +칶 +칷 +칸 +칹 +칺 +칻 +칼 +칽 +칾 +칿 +캀 +캁 +캂 +캃 +캄 +캅 +캆 +캇 +캈 +캉 +캊 +캋 +캌 +캍 +캎 +캏 +캐 +캑 +캒 +캓 +캔 +캕 +캖 +캗 +캘 +캙 +캚 +캛 +캜 +캝 +캞 +캟 +캠 +캡 +캢 +캣 +캤 +캥 +캦 +캧 +캨 +캩 +캪 +캫 +캬 +캭 +캮 +캯 +캰 +캱 +캲 +캳 +캴 +캵 +캶 +캷 +캸 +캹 +캺 +캻 +캼 +캽 +캾 +캿 +컀 +컁 +컂 +컃 +컄 +컅 +컆 +컇 +컈 +컉 +컊 +컋 +컌 +컍 +컎 +컏 +컐 +컑 +컒 +컓 +컔 +컕 +컖 +컗 +컘 +컙 +컚 +컛 +컜 +컝 +컞 +컟 +컠 +컡 +컢 +컣 +커 +컥 +컦 +컧 +컨 +컩 +컪 +컫 +컬 +컭 +컮 +컯 +컰 +컱 +컲 +컳 +컴 +컵 +컶 +컷 +컸 +컹 +컺 +컻 +컼 +컽 +컾 +컿 +케 +켁 +켂 +켃 +켄 +켅 +켆 +켇 +켈 +켉 +켊 +켋 +켌 +켍 +켎 +켏 +켐 +켑 +켒 +켓 +켔 +켕 +켖 +켗 +켘 +켙 +켚 +켛 +켜 +켝 +켞 +켟 +켠 +켡 +켢 +켣 +켤 +켥 +켦 +켧 +켨 +켩 +켪 +켫 +켬 +켭 +켮 +켯 +켰 +켱 +켲 +켳 +켴 +켵 +켶 +켷 +켸 +켹 +켺 +켻 +켼 +켽 +켾 +켿 +콀 +콁 +콂 +콃 +콄 +콅 +콆 +콇 +콈 +콉 +콊 +콋 +콌 +콍 +콎 +콏 +콐 +콑 +콒 +콓 +코 +콕 +콖 +콗 +콘 +콙 +콚 +콛 +콜 +콝 +콞 +콟 +콠 +콡 +콢 +콣 +콤 +콥 +콦 +콧 +콨 +콩 +콪 +콫 +콬 +콭 +콮 +콯 +콰 +콱 +콲 +콳 +콴 +콵 +콶 +콷 +콸 +콹 +콺 +콻 +콼 +콽 +콾 +콿 +쾀 +쾁 +쾂 +쾃 +쾄 +쾅 +쾆 +쾇 +쾈 +쾉 +쾊 +쾋 +쾌 +쾍 +쾎 +쾏 +쾐 +쾑 +쾒 +쾓 +쾔 +쾕 +쾖 +쾗 +쾘 +쾙 +쾚 +쾛 +쾜 +쾝 +쾞 +쾟 +쾠 +쾡 +쾢 +쾣 +쾤 +쾥 +쾦 +쾧 +쾨 +쾩 +쾪 +쾫 +쾬 +쾭 +쾮 +쾯 +쾰 +쾱 +쾲 +쾳 +쾴 +쾵 +쾶 +쾷 +쾸 +쾹 +쾺 +쾻 +쾼 +쾽 +쾾 +쾿 +쿀 +쿁 +쿂 +쿃 +쿄 +쿅 +쿆 +쿇 +쿈 +쿉 +쿊 +쿋 +쿌 +쿍 +쿎 +쿏 +쿐 +쿑 +쿒 +쿓 +쿔 +쿕 +쿖 +쿗 +쿘 +쿙 +쿚 +쿛 +쿜 +쿝 +쿞 +쿟 +쿠 +쿡 +쿢 +쿣 +쿤 +쿥 +쿦 +쿧 +쿨 +쿩 +쿪 +쿫 +쿬 +쿭 +쿮 +쿯 +쿰 +쿱 +쿲 +쿳 +쿴 +쿵 +쿶 +쿷 +쿸 +쿹 +쿺 +쿻 +쿼 +쿽 +쿾 +쿿 +퀀 +퀁 +퀂 +퀃 +퀄 +퀅 +퀆 +퀇 +퀈 +퀉 +퀊 +퀋 +퀌 +퀍 +퀎 +퀏 +퀐 +퀑 +퀒 +퀓 +퀔 +퀕 +퀖 +퀗 +퀘 +퀙 +퀚 +퀛 +퀜 +퀝 +퀞 +퀟 +퀠 +퀡 +퀢 +퀣 +퀤 +퀥 +퀦 +퀧 +퀨 +퀩 +퀪 +퀫 +퀬 +퀭 +퀮 +퀯 +퀰 +퀱 +퀲 +퀳 +퀴 +퀵 +퀶 +퀷 +퀸 +퀹 +퀺 +퀻 +퀼 +퀽 +퀾 +퀿 +큀 +큁 +큂 +큃 +큄 +큅 +큆 +큇 +큈 +큉 +큊 +큋 +큌 +큍 +큎 +큏 +큐 +큑 +큒 +큓 +큔 +큕 +큖 +큗 +큘 +큙 +큚 +큛 +큜 +큝 +큞 +큟 +큠 +큡 +큢 +큣 +큤 +큥 +큦 +큧 +큨 +큩 +큪 +큫 +크 +큭 +큮 +큯 +큰 +큱 +큲 +큳 +클 +큵 +큶 +큷 +큸 +큹 +큺 +큻 +큼 +큽 +큾 +큿 +킀 +킁 +킂 +킃 +킄 +킅 +킆 +킇 +킈 +킉 +킊 +킋 +킌 +킍 +킎 +킏 +킐 +킑 +킒 +킓 +킔 +킕 +킖 +킗 +킘 +킙 +킚 +킛 +킜 +킝 +킞 +킟 +킠 +킡 +킢 +킣 +키 +킥 +킦 +킧 +킨 +킩 +킪 +킫 +킬 +킭 +킮 +킯 +킰 +킱 +킲 +킳 +킴 +킵 +킶 +킷 +킸 +킹 +킺 +킻 +킼 +킽 +킾 +킿 +타 +탁 +탂 +탃 +탄 +탅 +탆 +탇 +탈 +탉 +탊 +탋 +탌 +탍 +탎 +탏 +탐 +탑 +탒 +탓 +탔 +탕 +탖 +탗 +탘 +탙 +탚 +탛 +태 +택 +탞 +탟 +탠 +탡 +탢 +탣 +탤 +탥 +탦 +탧 +탨 +탩 +탪 +탫 +탬 +탭 +탮 +탯 +탰 +탱 +탲 +탳 +탴 +탵 +탶 +탷 +탸 +탹 +탺 +탻 +탼 +탽 +탾 +탿 +턀 +턁 
+턂 +턃 +턄 +턅 +턆 +턇 +턈 +턉 +턊 +턋 +턌 +턍 +턎 +턏 +턐 +턑 +턒 +턓 +턔 +턕 +턖 +턗 +턘 +턙 +턚 +턛 +턜 +턝 +턞 +턟 +턠 +턡 +턢 +턣 +턤 +턥 +턦 +턧 +턨 +턩 +턪 +턫 +턬 +턭 +턮 +턯 +터 +턱 +턲 +턳 +턴 +턵 +턶 +턷 +털 +턹 +턺 +턻 +턼 +턽 +턾 +턿 +텀 +텁 +텂 +텃 +텄 +텅 +텆 +텇 +텈 +텉 +텊 +텋 +테 +텍 +텎 +텏 +텐 +텑 +텒 +텓 +텔 +텕 +텖 +텗 +텘 +텙 +텚 +텛 +템 +텝 +텞 +텟 +텠 +텡 +텢 +텣 +텤 +텥 +텦 +텧 +텨 +텩 +텪 +텫 +텬 +텭 +텮 +텯 +텰 +텱 +텲 +텳 +텴 +텵 +텶 +텷 +텸 +텹 +텺 +텻 +텼 +텽 +텾 +텿 +톀 +톁 +톂 +톃 +톄 +톅 +톆 +톇 +톈 +톉 +톊 +톋 +톌 +톍 +톎 +톏 +톐 +톑 +톒 +톓 +톔 +톕 +톖 +톗 +톘 +톙 +톚 +톛 +톜 +톝 +톞 +톟 +토 +톡 +톢 +톣 +톤 +톥 +톦 +톧 +톨 +톩 +톪 +톫 +톬 +톭 +톮 +톯 +톰 +톱 +톲 +톳 +톴 +통 +톶 +톷 +톸 +톹 +톺 +톻 +톼 +톽 +톾 +톿 +퇀 +퇁 +퇂 +퇃 +퇄 +퇅 +퇆 +퇇 +퇈 +퇉 +퇊 +퇋 +퇌 +퇍 +퇎 +퇏 +퇐 +퇑 +퇒 +퇓 +퇔 +퇕 +퇖 +퇗 +퇘 +퇙 +퇚 +퇛 +퇜 +퇝 +퇞 +퇟 +퇠 +퇡 +퇢 +퇣 +퇤 +퇥 +퇦 +퇧 +퇨 +퇩 +퇪 +퇫 +퇬 +퇭 +퇮 +퇯 +퇰 +퇱 +퇲 +퇳 +퇴 +퇵 +퇶 +퇷 +퇸 +퇹 +퇺 +퇻 +퇼 +퇽 +퇾 +퇿 +툀 +툁 +툂 +툃 +툄 +툅 +툆 +툇 +툈 +툉 +툊 +툋 +툌 +툍 +툎 +툏 +툐 +툑 +툒 +툓 +툔 +툕 +툖 +툗 +툘 +툙 +툚 +툛 +툜 +툝 +툞 +툟 +툠 +툡 +툢 +툣 +툤 +툥 +툦 +툧 +툨 +툩 +툪 +툫 +투 +툭 +툮 +툯 +툰 +툱 +툲 +툳 +툴 +툵 +툶 +툷 +툸 +툹 +툺 +툻 +툼 +툽 +툾 +툿 +퉀 +퉁 +퉂 +퉃 +퉄 +퉅 +퉆 +퉇 +퉈 +퉉 +퉊 +퉋 +퉌 +퉍 +퉎 +퉏 +퉐 +퉑 +퉒 +퉓 +퉔 +퉕 +퉖 +퉗 +퉘 +퉙 +퉚 +퉛 +퉜 +퉝 +퉞 +퉟 +퉠 +퉡 +퉢 +퉣 +퉤 +퉥 +퉦 +퉧 +퉨 +퉩 +퉪 +퉫 +퉬 +퉭 +퉮 +퉯 +퉰 +퉱 +퉲 +퉳 +퉴 +퉵 +퉶 +퉷 +퉸 +퉹 +퉺 +퉻 +퉼 +퉽 +퉾 +퉿 +튀 +튁 +튂 +튃 +튄 +튅 +튆 +튇 +튈 +튉 +튊 +튋 +튌 +튍 +튎 +튏 +튐 +튑 +튒 +튓 +튔 +튕 +튖 +튗 +튘 +튙 +튚 +튛 +튜 +튝 +튞 +튟 +튠 +튡 +튢 +튣 +튤 +튥 +튦 +튧 +튨 +튩 +튪 +튫 +튬 +튭 +튮 +튯 +튰 +튱 +튲 +튳 +튴 +튵 +튶 +튷 +트 +특 +튺 +튻 +튼 +튽 +튾 +튿 +틀 +틁 +틂 +틃 +틄 +틅 +틆 +틇 +틈 +틉 +틊 +틋 +틌 +틍 +틎 +틏 +틐 +틑 +틒 +틓 +틔 +틕 +틖 +틗 +틘 +틙 +틚 +틛 +틜 +틝 +틞 +틟 +틠 +틡 +틢 +틣 +틤 +틥 +틦 +틧 +틨 +틩 +틪 +틫 +틬 +틭 +틮 +틯 +티 +틱 +틲 +틳 +틴 +틵 +틶 +틷 +틸 +틹 +틺 +틻 +틼 +틽 +틾 +틿 +팀 +팁 +팂 +팃 +팄 +팅 +팆 +팇 +팈 +팉 +팊 +팋 +파 +팍 +팎 +팏 +판 +팑 +팒 +팓 +팔 +팕 +팖 +팗 +팘 +팙 +팚 +팛 +팜 +팝 +팞 +팟 +팠 +팡 +팢 +팣 +팤 +팥 +팦 +팧 +패 +팩 +팪 +팫 +팬 +팭 +팮 +팯 +팰 +팱 +팲 +팳 +팴 +팵 +팶 +팷 +팸 +팹 +팺 +팻 +팼 +팽 +팾 +팿 +퍀 +퍁 +퍂 +퍃 +퍄 +퍅 +퍆 +퍇 +퍈 +퍉 +퍊 +퍋 +퍌 +퍍 +퍎 +퍏 +퍐 +퍑 +퍒 +퍓 +퍔 +퍕 +퍖 +퍗 +퍘 +퍙 +퍚 +퍛 +퍜 +퍝 +퍞 +퍟 +퍠 +퍡 +퍢 +퍣 +퍤 +퍥 +퍦 +퍧 +퍨 +퍩 +퍪 +퍫 +퍬 +퍭 +퍮 +퍯 +퍰 +퍱 +퍲 +퍳 +퍴 +퍵 +퍶 +퍷 +퍸 +퍹 +퍺 +퍻 +퍼 +퍽 +퍾 +퍿 +펀 +펁 +펂 +펃 +펄 +펅 +펆 +펇 +펈 +펉 +펊 +펋 +펌 +펍 +펎 +펏 +펐 +펑 +펒 +펓 +펔 +펕 +펖 +펗 +페 +펙 +펚 +펛 
+펜 +펝 +펞 +펟 +펠 +펡 +펢 +펣 +펤 +펥 +펦 +펧 +펨 +펩 +펪 +펫 +펬 +펭 +펮 +펯 +펰 +펱 +펲 +펳 +펴 +펵 +펶 +펷 +편 +펹 +펺 +펻 +펼 +펽 +펾 +펿 +폀 +폁 +폂 +폃 +폄 +폅 +폆 +폇 +폈 +평 +폊 +폋 +폌 +폍 +폎 +폏 +폐 +폑 +폒 +폓 +폔 +폕 +폖 +폗 +폘 +폙 +폚 +폛 +폜 +폝 +폞 +폟 +폠 +폡 +폢 +폣 +폤 +폥 +폦 +폧 +폨 +폩 +폪 +폫 +포 +폭 +폮 +폯 +폰 +폱 +폲 +폳 +폴 +폵 +폶 +폷 +폸 +폹 +폺 +폻 +폼 +폽 +폾 +폿 +퐀 +퐁 +퐂 +퐃 +퐄 +퐅 +퐆 +퐇 +퐈 +퐉 +퐊 +퐋 +퐌 +퐍 +퐎 +퐏 +퐐 +퐑 +퐒 +퐓 +퐔 +퐕 +퐖 +퐗 +퐘 +퐙 +퐚 +퐛 +퐜 +퐝 +퐞 +퐟 +퐠 +퐡 +퐢 +퐣 +퐤 +퐥 +퐦 +퐧 +퐨 +퐩 +퐪 +퐫 +퐬 +퐭 +퐮 +퐯 +퐰 +퐱 +퐲 +퐳 +퐴 +퐵 +퐶 +퐷 +퐸 +퐹 +퐺 +퐻 +퐼 +퐽 +퐾 +퐿 +푀 +푁 +푂 +푃 +푄 +푅 +푆 +푇 +푈 +푉 +푊 +푋 +푌 +푍 +푎 +푏 +푐 +푑 +푒 +푓 +푔 +푕 +푖 +푗 +푘 +푙 +푚 +푛 +표 +푝 +푞 +푟 +푠 +푡 +푢 +푣 +푤 +푥 +푦 +푧 +푨 +푩 +푪 +푫 +푬 +푭 +푮 +푯 +푰 +푱 +푲 +푳 +푴 +푵 +푶 +푷 +푸 +푹 +푺 +푻 +푼 +푽 +푾 +푿 +풀 +풁 +풂 +풃 +풄 +풅 +풆 +풇 +품 +풉 +풊 +풋 +풌 +풍 +풎 +풏 +풐 +풑 +풒 +풓 +풔 +풕 +풖 +풗 +풘 +풙 +풚 +풛 +풜 +풝 +풞 +풟 +풠 +풡 +풢 +풣 +풤 +풥 +풦 +풧 +풨 +풩 +풪 +풫 +풬 +풭 +풮 +풯 +풰 +풱 +풲 +풳 +풴 +풵 +풶 +풷 +풸 +풹 +풺 +풻 +풼 +풽 +풾 +풿 +퓀 +퓁 +퓂 +퓃 +퓄 +퓅 +퓆 +퓇 +퓈 +퓉 +퓊 +퓋 +퓌 +퓍 +퓎 +퓏 +퓐 +퓑 +퓒 +퓓 +퓔 +퓕 +퓖 +퓗 +퓘 +퓙 +퓚 +퓛 +퓜 +퓝 +퓞 +퓟 +퓠 +퓡 +퓢 +퓣 +퓤 +퓥 +퓦 +퓧 +퓨 +퓩 +퓪 +퓫 +퓬 +퓭 +퓮 +퓯 +퓰 +퓱 +퓲 +퓳 +퓴 +퓵 +퓶 +퓷 +퓸 +퓹 +퓺 +퓻 +퓼 +퓽 +퓾 +퓿 +픀 +픁 +픂 +픃 +프 +픅 +픆 +픇 +픈 +픉 +픊 +픋 +플 +픍 +픎 +픏 +픐 +픑 +픒 +픓 +픔 +픕 +픖 +픗 +픘 +픙 +픚 +픛 +픜 +픝 +픞 +픟 +픠 +픡 +픢 +픣 +픤 +픥 +픦 +픧 +픨 +픩 +픪 +픫 +픬 +픭 +픮 +픯 +픰 +픱 +픲 +픳 +픴 +픵 +픶 +픷 +픸 +픹 +픺 +픻 +피 +픽 +픾 +픿 +핀 +핁 +핂 +핃 +필 +핅 +핆 +핇 +핈 +핉 +핊 +핋 +핌 +핍 +핎 +핏 +핐 +핑 +핒 +핓 +핔 +핕 +핖 +핗 +하 +학 +핚 +핛 +한 +핝 +핞 +핟 +할 +핡 +핢 +핣 +핤 +핥 +핦 +핧 +함 +합 +핪 +핫 +핬 +항 +핮 +핯 +핰 +핱 +핲 +핳 +해 +핵 +핶 +핷 +핸 +핹 +핺 +핻 +핼 +핽 +핾 +핿 +햀 +햁 +햂 +햃 +햄 +햅 +햆 +햇 +했 +행 +햊 +햋 +햌 +햍 +햎 +햏 +햐 +햑 +햒 +햓 +햔 +햕 +햖 +햗 +햘 +햙 +햚 +햛 +햜 +햝 +햞 +햟 +햠 +햡 +햢 +햣 +햤 +향 +햦 +햧 +햨 +햩 +햪 +햫 +햬 +햭 +햮 +햯 +햰 +햱 +햲 +햳 +햴 +햵 +햶 +햷 +햸 +햹 +햺 +햻 +햼 +햽 +햾 +햿 +헀 +헁 +헂 +헃 +헄 +헅 +헆 +헇 +허 +헉 +헊 +헋 +헌 +헍 +헎 +헏 +헐 +헑 +헒 +헓 +헔 +헕 +헖 +헗 +험 +헙 +헚 +헛 +헜 +헝 +헞 +헟 +헠 +헡 +헢 +헣 +헤 +헥 +헦 +헧 +헨 +헩 +헪 +헫 +헬 +헭 +헮 +헯 +헰 +헱 +헲 +헳 +헴 +헵 +헶 +헷 +헸 +헹 +헺 +헻 +헼 +헽 +헾 +헿 +혀 +혁 +혂 +혃 +현 +혅 +혆 +혇 +혈 +혉 +혊 +혋 +혌 +혍 +혎 +혏 +혐 +협 +혒 +혓 +혔 +형 +혖 +혗 +혘 +혙 +혚 +혛 +혜 +혝 +혞 +혟 +혠 +혡 +혢 +혣 +혤 +혥 +혦 +혧 +혨 +혩 +혪 +혫 +혬 +혭 +혮 +혯 +혰 +혱 +혲 +혳 +혴 +혵 
+혶 +혷 +호 +혹 +혺 +혻 +혼 +혽 +혾 +혿 +홀 +홁 +홂 +홃 +홄 +홅 +홆 +홇 +홈 +홉 +홊 +홋 +홌 +홍 +홎 +홏 +홐 +홑 +홒 +홓 +화 +확 +홖 +홗 +환 +홙 +홚 +홛 +활 +홝 +홞 +홟 +홠 +홡 +홢 +홣 +홤 +홥 +홦 +홧 +홨 +황 +홪 +홫 +홬 +홭 +홮 +홯 +홰 +홱 +홲 +홳 +홴 +홵 +홶 +홷 +홸 +홹 +홺 +홻 +홼 +홽 +홾 +홿 +횀 +횁 +횂 +횃 +횄 +횅 +횆 +횇 +횈 +횉 +횊 +횋 +회 +획 +횎 +횏 +횐 +횑 +횒 +횓 +횔 +횕 +횖 +횗 +횘 +횙 +횚 +횛 +횜 +횝 +횞 +횟 +횠 +횡 +횢 +횣 +횤 +횥 +횦 +횧 +효 +횩 +횪 +횫 +횬 +횭 +횮 +횯 +횰 +횱 +횲 +횳 +횴 +횵 +횶 +횷 +횸 +횹 +횺 +횻 +횼 +횽 +횾 +횿 +훀 +훁 +훂 +훃 +후 +훅 +훆 +훇 +훈 +훉 +훊 +훋 +훌 +훍 +훎 +훏 +훐 +훑 +훒 +훓 +훔 +훕 +훖 +훗 +훘 +훙 +훚 +훛 +훜 +훝 +훞 +훟 +훠 +훡 +훢 +훣 +훤 +훥 +훦 +훧 +훨 +훩 +훪 +훫 +훬 +훭 +훮 +훯 +훰 +훱 +훲 +훳 +훴 +훵 +훶 +훷 +훸 +훹 +훺 +훻 +훼 +훽 +훾 +훿 +휀 +휁 +휂 +휃 +휄 +휅 +휆 +휇 +휈 +휉 +휊 +휋 +휌 +휍 +휎 +휏 +휐 +휑 +휒 +휓 +휔 +휕 +휖 +휗 +휘 +휙 +휚 +휛 +휜 +휝 +휞 +휟 +휠 +휡 +휢 +휣 +휤 +휥 +휦 +휧 +휨 +휩 +휪 +휫 +휬 +휭 +휮 +휯 +휰 +휱 +휲 +휳 +휴 +휵 +휶 +휷 +휸 +휹 +휺 +휻 +휼 +휽 +휾 +휿 +흀 +흁 +흂 +흃 +흄 +흅 +흆 +흇 +흈 +흉 +흊 +흋 +흌 +흍 +흎 +흏 +흐 +흑 +흒 +흓 +흔 +흕 +흖 +흗 +흘 +흙 +흚 +흛 +흜 +흝 +흞 +흟 +흠 +흡 +흢 +흣 +흤 +흥 +흦 +흧 +흨 +흩 +흪 +흫 +희 +흭 +흮 +흯 +흰 +흱 +흲 +흳 +흴 +흵 +흶 +흷 +흸 +흹 +흺 +흻 +흼 +흽 +흾 +흿 +힀 +힁 +힂 +힃 +힄 +힅 +힆 +힇 +히 +힉 +힊 +힋 +힌 +힍 +힎 +힏 +힐 +힑 +힒 +힓 +힔 +힕 +힖 +힗 +힘 +힙 +힚 +힛 +힜 +힝 +힞 +힟 +힠 +힡 +힢 +힣 +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +© +‥ +{ +} +\ +| +@ +^ +~ +÷ +∕ +∙ +⋅ +· +± +∓ +∩ +∪ +□ +← +↔ +⇒ +⇐ +⇔ +∀ +∃ +∄ +∴ +∵ +∝ +∞ +⊥ +∟ +∠ +∡ +∢ +′ +″ +∥ +⊾ +⊿ +∂ +∫ +∬ +∭ +∮ +∯ +∰ +∑ +∏ +√ +∛ +∜ +∱ +∲ +∳ +∶ +∷ +∼ +® +℉ +Ω +℧ +Å +⌀ +ℏ +⅀ +⍺ +⍵ +¢ +€ +£ +¥ +₿ +Ⅰ +Ⅱ +Ⅲ +Ⅳ +Ⅴ +Ⅵ +Ⅶ +Ⅷ +Ⅸ +Ⅹ +Ⅺ +Ⅻ +ⅰ +ⅱ +ⅲ +ⅳ +ⅴ +ⅵ +ⅶ +ⅷ +ⅸ +ⅹ +ⅺ +ⅻ +➀ +➁ +➂ +➃ +➄ +➅ +➆ +➇ +➈ +➉ +➊ +➋ +➌ +➍ +➎ +➏ +➐ +➑ +➒ +➓ +❶ +❷ +❸ +❹ +❺ +❻ +❼ +❽ +❾ +❿ +① +② +③ +④ +⑤ +⑥ +⑦ +⑧ +⑨ +⑩ +● +▶ +𝑢 +︽ +– +﹥ +𝜓 +• +∋ +ƒ +० +⬆ +Ạ +◀ + +▫ +︾ +À +Á + +à +Ä +Å +Æ +Ç +È +É +Ê +Ë +Ì +Í +Î +Ï +Ð +Ñ +Ò +Ó +Ô +Õ +Ö +Ø +Ù +Ú +Û +Ü +Ý +Þ +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ð +ñ +ò +ó +ô +õ +ö +ø +ù +ú +û +ü +ý +þ +ÿ +¡ +¤ +¦ +§ +¨ +ª +« +¬ +¯ +° +² +³ +´ +µ +¶ +¸ +¹ +º +» +¼ +½ +¾ +¿ +× +‐ +‑ +‒ +— +― +‖ +‗ +‘ +’ +‚ +‛ +“ +” +„ +‟ +† +‡ +‣ +․ +… +‧ +‰ +‴ +‵ +‶ +‷ +‸ +‹ +› +※ +‼ +‽ +‾ +₤ +₡ +₹ +− +∖ +∗ +≈ +≠ +≡ +≤ +≥ +⊂ +⊃ +↑ +→ +↓ +↕ +™ +Ω +℮ +∆ +✓ +✗ +✘ +▪ +◼ +✔ +✕ +☑ +☒ +№ +₽ +₴ +Α +α +Β +β +Γ +γ +Δ +δ +Ε +ε +Ζ +ζ +Η +η +Θ +θ +Ι +ι +Κ +κ +Λ +λ +Μ +μ +Ν +ν +Ξ +ξ +Ο +ο +Π +π +Ρ +ρ +Σ +σ +ς +Τ +τ +Υ +υ +Φ +φ +Χ +χ +Ψ +ψ +ω diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2497aec268ef61cac813a7d34181fd960f8ded6 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocrv5_latin_dict.txt @@ -0,0 +1,502 @@ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +¡ +¢ +£ +¤ +¥ +¦ +§ +¨ +© +ª +« +¬ +­ +® +¯ +° +± +² +³ +´ +µ +¶ +· +¸ +¹ +º +» +¼ +½ +¾ +¿ +À +Á + +à +Ä +Å +Æ +Ç +È +É +Ê +Ë +Ì +Í +Î +Ï +Ð +Ñ +Ò +Ó +Ô +Õ +Ö +× +Ø +Ù +Ú +Û +Ü +Ý +Þ +ß +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ð +ñ +ò +ó +ô +õ +ö +÷ +ø +ù +ú +û +ü +ý +þ +ÿ +Ą +ą +Ć +ć +Č +č +Ď +ď +Đ +đ +Ė +ė +Ę +ę +Ě +ě +Ğ +ğ +Į +į +İ +ı +Ĺ +ĺ +Ľ +ľ +Ł +ł +Ń +ń +Ň +ň +ō +Ő +ő +Œ +œ +Ŕ +ŕ +Ř +ř +Ś +ś +Ş +ş +Š +š +Ť +ť +Ū +ū +Ů +ů +Ű +ű +Ų +ų +Ÿ +Ź +ź +Ż +ż +Ž +ž +ƒ +ʒ +Ω +α +β +γ +δ +ε +ζ +η +θ +ι +κ +λ +μ +ν +ξ +ο +π +ρ +ς +σ +τ +υ +φ +χ +ψ +ω +з +० +Ṡ +ẞ +Ạ +‐ +‑ +‒ +– +— +― +‖ +‗ +‘ +’ +‚ +‛ +“ +” +„ +‟ +† +‡ +• +‣ +․ +‥ +… +‧ +‰ +′ +″ +‴ +‵ +‶ +‷ +‸ +‹ +› +※ +‼ +‽ +‾ +⁄ +₂ +₃ +₡ +₤ +€ +₴ +₹ +₽ +₿ +℉ +ℏ +№ +™ +Ω +℧ +Å +℮ +⅀ +Ⅰ +Ⅱ +Ⅲ +Ⅳ +Ⅴ +Ⅵ +Ⅶ +Ⅷ +Ⅸ +Ⅹ +Ⅺ +Ⅻ +ⅰ +ⅱ +ⅲ +ⅳ +ⅴ +ⅵ +ⅶ +ⅷ +ⅸ +ⅹ +ⅺ +ⅻ +← +↑ +→ +↓ +↔ +↕ +⇐ +⇒ +⇔ +∀ +∂ +∃ +∄ +∅ +∆ +∋ +∏ +∑ +− +∓ +∕ +∖ +∗ +∙ +√ +∛ +∜ +∝ +∞ +∟ +∠ +∡ +∢ +∥ +∧ +∨ +∩ +∪ +∫ +∬ +∭ +∮ +∯ +∰ +∱ +∲ +∳ +∴ +∵ +∶ +∷ +∼ +≈ +≠ +≡ +≤ +≥ +⊂ +⊃ +⊥ +⊾ +⊿ +⋅ +⌀ +⍵ +⍺ +① +② +③ +④ +⑤ +⑥ +⑦ +⑧ +⑨ +⑩ +─ +│ +└ +├ +■ +□ +▪ +▫ +▶ +◀ +● +◼ +☑ +☒ +✓ +✔ +✕ +✗ +✘ +❶ +❷ +❸ +❹ +❺ +❻ +❼ +❽ +❾ +❿ +➀ +➁ +➂ +➃ +➄ +➅ +➆ +➇ +➈ +➉ +➊ +➋ +➌ +➍ +➎ +➏ +➐ +➑ +➒ +➓ +⬆ +、 +fi +fl +︽ +︾ +﹥ +� +𝑢 +𝜓 diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..19d81892c205627f296adbf8b20ea41aba2de5d0 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt @@ -0,0 +1,128 @@ +t +a +_ +i +m +g +/ +3 +I +L +S +V +R +C +2 +0 +1 +v +l +9 +7 +8 +. 
+j +p +ப +ூ +த +ம +ி +வ +ர +் +ந +ோ +ன +6 +ஆ +ற +ல +5 +ள +ா +ொ +ழ +ு +4 +ெ +ண +க +ட +ை +ே +ச +ய +ஒ +இ +அ +ங +உ +ீ +ஞ +எ +ஓ +ஃ +ஜ +ஷ +ஸ +ஏ +ஊ +ஹ +ஈ +ஐ +ௌ +ஔ +s +c +e +n +w +F +T +O +P +K +A +N +G +Y +E +M +H +U +B +o +b +D +d +r +W +u +y +f +X +k +q +h +J +z +Z +Q +x +- +' +$ +, +% +@ +é +! +# ++ +É +& +: +( +? + diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt new file mode 100644 index 0000000000000000000000000000000000000000..83d74cc7e5f899ca43b23fa690d84d70bee535e3 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt @@ -0,0 +1,151 @@ +t +e +_ +i +m +g +/ +5 +I +L +S +V +R +C +2 +0 +1 +v +a +l +3 +4 +8 +9 +. +j +p +త +ె +ర +క +్ +ి +ం +చ +ే +ద +ు +7 +6 +ఉ +ా +మ +ట +ో +వ +ప +ల +శ +ఆ +య +ై +భ +' +ీ +గ +ూ +డ +ధ +హ +న +జ +స +[ +‌ +ష +అ +ణ +ఫ +బ +ఎ +; +ళ +థ +ొ +ఠ +ృ +ఒ +ఇ +ః +ఊ +ఖ +- +ఐ +ఘ +ౌ +ఏ +ఈ +ఛ +, +ఓ +ఞ +| +? +: +ఢ +" +( +” +! 
++ +) +* += +& +“ +€ +] +£ +$ +s +c +n +w +k +J +G +u +d +r +E +o +h +y +b +f +B +M +O +T +N +D +P +A +F +x +W +Y +U +H +K +X +z +Z +Q +q +É +% +# +@ +é diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml new file mode 100644 index 0000000000000000000000000000000000000000..da36bf448662d74cfa93d4bc45d5f11597b334f8 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml @@ -0,0 +1,69 @@ +lang: + ch_lite: + det: ch_PP-OCRv5_det_infer.pth + rec: ch_PP-OCRv5_rec_infer.pth + dict: ppocrv5_dict.txt + ch_lite_v4: + det: ch_PP-OCRv5_det_infer.pth + rec: ch_PP-OCRv4_rec_infer.pth + dict: ppocr_keys_v1.txt + ch_server: + det: ch_PP-OCRv5_det_infer.pth + rec: ch_PP-OCRv5_rec_server_infer.pth + dict: ppocrv5_dict.txt + ch_server_v4: + det: ch_PP-OCRv5_det_infer.pth + rec: ch_PP-OCRv4_rec_server_infer.pth + dict: ppocr_keys_v1.txt + ch: + det: ch_PP-OCRv5_det_infer.pth + rec: ch_PP-OCRv4_rec_server_doc_infer.pth + dict: ppocrv4_doc_dict.txt + en: + det: en_PP-OCRv3_det_infer.pth + rec: en_PP-OCRv4_rec_infer.pth + dict: en_dict.txt + korean: + det: ch_PP-OCRv5_det_infer.pth + rec: korean_PP-OCRv5_rec_infer.pth + dict: ppocrv5_korean_dict.txt + japan: + det: ch_PP-OCRv5_det_infer.pth + rec: ch_PP-OCRv5_rec_server_infer.pth + dict: ppocrv5_dict.txt + chinese_cht: + det: ch_PP-OCRv5_det_infer.pth + rec: ch_PP-OCRv5_rec_server_infer.pth + dict: ppocrv5_dict.txt + ta: + det: Multilingual_PP-OCRv3_det_infer.pth + rec: ta_PP-OCRv3_rec_infer.pth + dict: ta_dict.txt + te: + det: Multilingual_PP-OCRv3_det_infer.pth + rec: te_PP-OCRv3_rec_infer.pth + dict: te_dict.txt + ka: + det: Multilingual_PP-OCRv3_det_infer.pth + rec: ka_PP-OCRv3_rec_infer.pth + dict: ka_dict.txt + latin: + det: ch_PP-OCRv5_det_infer.pth + rec: latin_PP-OCRv5_rec_infer.pth + dict: ppocrv5_latin_dict.txt + arabic: 
+ det: Multilingual_PP-OCRv3_det_infer.pth + rec: arabic_PP-OCRv3_rec_infer.pth + dict: arabic_dict.txt + cyrillic: + det: Multilingual_PP-OCRv3_det_infer.pth + rec: cyrillic_PP-OCRv3_rec_infer.pth + dict: cyrillic_dict.txt + devanagari: + det: Multilingual_PP-OCRv3_det_infer.pth + rec: devanagari_PP-OCRv3_rec_infer.pth + dict: devanagari_dict.txt + east_slavic: + det: ch_PP-OCRv5_det_infer.pth + rec: eslav_PP-OCRv5_rec_infer.pth + dict: ppocrv5_eslav_dict.txt \ No newline at end of file diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/__init__.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f64ba567a631a847c6c2ea3d345f86865056cb53 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. \ No newline at end of file diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/infer/__init__.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/infer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/infer/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/infer/predict_cls.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/infer/predict_cls.py new file mode 100755 index 0000000000000000000000000000000000000000..5dea3390a6d8bbeb41d8b765eeab38d3fae4ef65 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/infer/predict_cls.py @@ -0,0 +1,106 @@ +import cv2 +import copy +import numpy as np +import math +import time +import torch +from ...pytorchocr.base_ocr_v20 import BaseOCRV20 +from . 
class TextClassifier(BaseOCRV20):
    """Classify text-line crops and flip the ones predicted as upside down.

    The network architecture is resolved from the weights path via
    ``utility.get_arch_config``; raw network outputs are decoded into
    ``(label, score)`` pairs by a ``ClsPostProcess`` operator.
    """

    def __init__(self, args, **kwargs):
        """Build the classifier network and load its weights.

        Args:
            args: Namespace-like object; the attributes read here are
                ``device``, ``cls_image_shape`` (comma-separated "C,H,W"
                string), ``cls_batch_num``, ``cls_thresh``, ``label_list``,
                ``cls_model_path``, ``cls_yaml_path``, ``limited_max_width``
                and ``limited_min_width``.
            **kwargs: Forwarded unchanged to ``BaseOCRV20.__init__``.
        """
        self.device = args.device
        # "C,H,W" string -> [C, H, W]; parsed once (the original parsed it twice).
        self.cls_image_shape = [int(v) for v in args.cls_image_shape.split(",")]
        self.cls_batch_num = args.cls_batch_num
        self.cls_thresh = args.cls_thresh
        postprocess_params = {
            'name': 'ClsPostProcess',
            "label_list": args.label_list,
        }
        self.postprocess_op = build_post_process(postprocess_params)

        self.weights_path = args.cls_model_path
        self.yaml_path = args.cls_yaml_path
        network_config = utility.get_arch_config(self.weights_path)
        super(TextClassifier, self).__init__(network_config, **kwargs)

        self.limited_max_width = args.limited_max_width
        self.limited_min_width = args.limited_min_width

        self.load_pytorch_weights(self.weights_path)
        self.net.eval()
        self.net.to(self.device)

    def resize_norm_img(self, img):
        """Resize a crop to the classifier height, normalise it and pad it.

        Args:
            img: Image array; ``img.shape[0]`` is used as height and
                ``img.shape[1]`` as width.

        Returns:
            A float32 array of shape ``(imgC, imgH, imgW)`` where ``imgW`` is
            the configured width clamped to
            ``[limited_min_width, limited_max_width]``. Pixel values are
            mapped by ``x / 255``, ``- 0.5``, ``/ 0.5``; columns to the right
            of the resized crop stay zero.
        """
        imgC, imgH, imgW = self.cls_image_shape
        h = img.shape[0]
        w = img.shape[1]
        ratio = w / float(h)
        imgW = max(min(imgW, self.limited_max_width), self.limited_min_width)
        ratio_imgH = math.ceil(imgH * ratio)
        ratio_imgH = max(ratio_imgH, self.limited_min_width)
        if ratio_imgH > imgW:
            # Crop is wider than the canvas: squeeze it to the full width.
            resized_w = imgW
        else:
            # Keep the aspect ratio; the remainder of the canvas is padding.
            resized_w = int(math.ceil(imgH * ratio))
        resized_image = cv2.resize(img, (resized_w, imgH))
        resized_image = resized_image.astype('float32')
        if self.cls_image_shape[0] == 1:
            # Single-channel model: add the channel axis in front.
            resized_image = resized_image / 255
            resized_image = resized_image[np.newaxis, :]
        else:
            # HWC -> CHW.
            resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im

    def __call__(self, img_list):
        """Classify every crop and rotate the confidently flipped ones.

        Args:
            img_list: List of image arrays. The list is deep-copied, so the
                caller's images are never modified.

        Returns:
            A tuple ``(img_list, cls_res, elapse)``:
            ``img_list`` is the copied list with crops rotated by 180 degrees
            when their label contains ``'180'`` and the score exceeds
            ``cls_thresh``; ``cls_res[i]`` is ``[label, score]`` for input
            ``i``; ``elapse`` is the accumulated inference plus
            post-processing time in seconds.
        """
        img_list = copy.deepcopy(img_list)
        img_num = len(img_list)
        # Process crops in order of aspect ratio (the original comment notes
        # that sorting speeds up classification).
        aspect_ratios = [img.shape[1] / float(img.shape[0]) for img in img_list]
        indices = np.argsort(np.array(aspect_ratios))

        # One independent placeholder per image; ``[[...]] * n`` would make
        # every slot alias the same list object.
        cls_res = [['', 0.0] for _ in range(img_num)]
        batch_num = self.cls_batch_num
        elapse = 0
        for beg_img_no in range(0, img_num, batch_num):
            end_img_no = min(img_num, beg_img_no + batch_num)
            norm_img_batch = np.concatenate([
                self.resize_norm_img(img_list[indices[ino]])[np.newaxis, :]
                for ino in range(beg_img_no, end_img_no)
            ])
            starttime = time.time()

            with torch.no_grad():
                inp = torch.from_numpy(norm_img_batch)
                inp = inp.to(self.device)
                prob_out = self.net(inp)
            prob_out = prob_out.cpu().numpy()

            cls_result = self.postprocess_op(prob_out)
            elapse += time.time() - starttime
            for rno, (label, score) in enumerate(cls_result):
                # Map the position inside the sorted batch back to the
                # caller's original index.
                target = indices[beg_img_no + rno]
                cls_res[target] = [label, score]
                if '180' in label and score > self.cls_thresh:
                    img_list[target] = cv2.rotate(img_list[target],
                                                  cv2.ROTATE_180)
        return img_list, cls_res, elapse
class TextDetector(BaseOCRV20):
    """Text detector supporting the DB, DB++, EAST, SAST, PSE and FCE heads.

    Pre-processing operators are built with ``create_operators`` and the
    algorithm-specific post-processor with ``build_post_process``. The
    network architecture is resolved from the weights path via
    ``utility.get_arch_config``.
    """

    def __init__(self, args, **kwargs):
        """Configure pre/post-processing for ``args.det_algorithm`` and load weights.

        Args:
            args: Namespace-like object. ``det_algorithm``, ``device``,
                ``det_limit_side_len``, ``det_limit_type``,
                ``det_model_path`` and ``det_yaml_path`` are always read;
                further ``det_*`` attributes are read depending on the
                selected algorithm.
            **kwargs: Forwarded unchanged to ``BaseOCRV20.__init__``.
        """
        self.args = args
        self.det_algorithm = args.det_algorithm
        self.device = args.device
        pre_process_list = [{
            'DetResizeForTest': {
                'limit_side_len': args.det_limit_side_len,
                'limit_type': args.det_limit_type,
            }
        }, {
            'NormalizeImage': {
                'std': [0.229, 0.224, 0.225],
                'mean': [0.485, 0.456, 0.406],
                'scale': '1./255.',
                'order': 'hwc'
            }
        }, {
            'ToCHWImage': None
        }, {
            'KeepKeys': {
                'keep_keys': ['image', 'shape']
            }
        }]
        postprocess_params = {}
        if self.det_algorithm in ("DB", "DB++"):
            # DB and DB++ share the same post-processing parameters.
            postprocess_params['name'] = 'DBPostProcess'
            postprocess_params["thresh"] = args.det_db_thresh
            postprocess_params["box_thresh"] = args.det_db_box_thresh
            postprocess_params["max_candidates"] = 1000
            postprocess_params["unclip_ratio"] = args.det_db_unclip_ratio
            postprocess_params["use_dilation"] = args.use_dilation
            postprocess_params["score_mode"] = args.det_db_score_mode
            if self.det_algorithm == "DB++":
                # DB++ only differs in its normalisation statistics.
                pre_process_list[1] = {
                    'NormalizeImage': {
                        'std': [1.0, 1.0, 1.0],
                        'mean':
                        [0.48109378172549, 0.45752457890196, 0.40787054090196],
                        'scale': '1./255.',
                        'order': 'hwc'
                    }
                }
        elif self.det_algorithm == "EAST":
            postprocess_params['name'] = 'EASTPostProcess'
            postprocess_params["score_thresh"] = args.det_east_score_thresh
            postprocess_params["cover_thresh"] = args.det_east_cover_thresh
            postprocess_params["nms_thresh"] = args.det_east_nms_thresh
        elif self.det_algorithm == "SAST":
            pre_process_list[0] = {
                'DetResizeForTest': {
                    'resize_long': args.det_limit_side_len
                }
            }
            postprocess_params['name'] = 'SASTPostProcess'
            postprocess_params["score_thresh"] = args.det_sast_score_thresh
            postprocess_params["nms_thresh"] = args.det_sast_nms_thresh
            self.det_sast_polygon = args.det_sast_polygon
            if self.det_sast_polygon:
                postprocess_params["sample_pts_num"] = 6
                postprocess_params["expand_scale"] = 1.2
                postprocess_params["shrink_ratio_of_width"] = 0.2
            else:
                postprocess_params["sample_pts_num"] = 2
                postprocess_params["expand_scale"] = 1.0
                postprocess_params["shrink_ratio_of_width"] = 0.3
        elif self.det_algorithm == "PSE":
            postprocess_params['name'] = 'PSEPostProcess'
            postprocess_params["thresh"] = args.det_pse_thresh
            postprocess_params["box_thresh"] = args.det_pse_box_thresh
            postprocess_params["min_area"] = args.det_pse_min_area
            postprocess_params["box_type"] = args.det_pse_box_type
            postprocess_params["scale"] = args.det_pse_scale
            self.det_pse_box_type = args.det_pse_box_type
        elif self.det_algorithm == "FCE":
            pre_process_list[0] = {
                'DetResizeForTest': {
                    'rescale_img': [1080, 736]
                }
            }
            postprocess_params['name'] = 'FCEPostProcess'
            postprocess_params["scales"] = args.scales
            postprocess_params["alpha"] = args.alpha
            postprocess_params["beta"] = args.beta
            postprocess_params["fourier_degree"] = args.fourier_degree
            postprocess_params["box_type"] = args.det_fce_box_type
        else:
            # NOTE(review): exiting the whole process with status 0 from a
            # constructor hides the failure from callers; kept unchanged for
            # backward compatibility - consider raising ValueError instead.
            print("unknown det_algorithm:{}".format(self.det_algorithm))
            sys.exit(0)

        self.preprocess_op = create_operators(pre_process_list)
        self.postprocess_op = build_post_process(postprocess_params)

        self.weights_path = args.det_model_path
        self.yaml_path = args.det_yaml_path
        network_config = utility.get_arch_config(self.weights_path)
        super(TextDetector, self).__init__(network_config, **kwargs)
        self.load_pytorch_weights(self.weights_path)
        self.net.eval()
        self.net.to(self.device)

    def _outputs_to_preds(self, outputs):
        """Convert the network output mapping into a dict of NumPy arrays.

        Only the keys required by the configured algorithm are copied.

        Raises:
            NotImplementedError: If ``det_algorithm`` is not one of the
                supported names.
        """
        preds = {}
        if self.det_algorithm == "EAST":
            preds['f_geo'] = outputs['f_geo'].cpu().numpy()
            preds['f_score'] = outputs['f_score'].cpu().numpy()
        elif self.det_algorithm == 'SAST':
            preds['f_border'] = outputs['f_border'].cpu().numpy()
            preds['f_score'] = outputs['f_score'].cpu().numpy()
            preds['f_tco'] = outputs['f_tco'].cpu().numpy()
            preds['f_tvo'] = outputs['f_tvo'].cpu().numpy()
        elif self.det_algorithm in ['DB', 'PSE', 'DB++']:
            preds['maps'] = outputs['maps'].cpu().numpy()
        elif self.det_algorithm == 'FCE':
            # The single-image path used to pass the raw tensors through
            # here; convert them like the batch path does so both agree.
            for i, output in enumerate(outputs.values()):
                preds['level_{}'.format(i)] = output.cpu().numpy()
        else:
            raise NotImplementedError
        return preds

    def _filter_boxes(self, dt_boxes, image_shape):
        """Clip detections to the image and, for quad outputs, drop tiny boxes.

        Polygon outputs (SAST with ``det_sast_polygon``, or PSE/FCE whose
        post-processor has ``box_type == 'poly'``) are only clipped; all
        other outputs are additionally reordered and size-filtered.
        """
        is_polygon = (
            (self.det_algorithm == "SAST" and self.det_sast_polygon)
            or (self.det_algorithm in ["PSE", "FCE"]
                and self.postprocess_op.box_type == 'poly')
        )
        if is_polygon:
            return self.filter_tag_det_res_only_clip(dt_boxes, image_shape)
        return self.filter_tag_det_res(dt_boxes, image_shape)

    def _batch_process_same_size(self, img_list):
        """Run detection on a list of images in a single forward pass.

        The images are expected to pre-process to identical shapes; if
        stacking them fails, each image is processed individually instead.

        Args:
            img_list: List of images intended to share the same size.

        Returns:
            A tuple ``(batch_results, total_elapse)`` where ``batch_results``
            holds one ``(dt_boxes, elapse)`` pair per input image and
            ``total_elapse`` is the elapsed time in seconds. If
            pre-processing fails for any image, every result is
            ``(None, 0)`` and ``total_elapse`` is ``0``.
        """
        starttime = time.time()

        # Pre-process every image.
        batch_data = []
        batch_shapes = []
        ori_imgs = []

        for img in img_list:
            ori_imgs.append(img.copy())

            data = transform({'image': img}, self.preprocess_op)
            if data is None:
                # Pre-processing failed: return empty results for the batch.
                return [(None, 0) for _ in img_list], 0

            img_processed, shape_list = data
            batch_data.append(img_processed)
            batch_shapes.append(shape_list)

        # Stack into batch arrays.
        try:
            batch_tensor = np.stack(batch_data, axis=0)
            batch_shapes = np.stack(batch_shapes, axis=0)
        except ValueError:
            # np.stack rejects mismatched shapes (or an empty list):
            # fall back to processing the images one by one.
            batch_results = [self(img) for img in img_list]
            return batch_results, time.time() - starttime

        # Batched inference.
        with torch.no_grad():
            inp = torch.from_numpy(batch_tensor)
            inp = inp.to(self.device)
            outputs = self.net(inp)

        preds = self._outputs_to_preds(outputs)

        # Post-process each image separately. The reported time is taken
        # before post-processing starts, as in the original implementation.
        batch_results = []
        total_elapse = time.time() - starttime

        for i, ori_im in enumerate(ori_imgs):
            # Slice with i:i + 1 to keep the batch dimension.
            single_preds = {key: value[i:i + 1] for key, value in preds.items()}

            post_result = self.postprocess_op(single_preds, batch_shapes[i:i + 1])
            dt_boxes = self._filter_boxes(post_result[0]['points'], ori_im.shape)

            batch_results.append((dt_boxes, total_elapse / len(img_list)))

        return batch_results, total_elapse

    def batch_predict(self, img_list, max_batch_size=8):
        """Detect text in several images, processed in chunks.

        Args:
            img_list: List of images.
            max_batch_size: Maximum number of images per forward pass.

        Returns:
            A list with one ``(dt_boxes, elapse)`` tuple per input image;
            an empty list when ``img_list`` is empty.
        """
        if not img_list:
            return []

        batch_results = []

        # Process chunk by chunk; images in a chunk should share one size.
        for i in range(0, len(img_list), max_batch_size):
            batch_imgs = img_list[i:i + max_batch_size]
            chunk_results, _ = self._batch_process_same_size(batch_imgs)
            batch_results.extend(chunk_results)

        return batch_results

    def order_points_clockwise(self, pts):
        """Order four points as top-left, top-right, bottom-right, bottom-left.

        Reference:
        https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py

        Args:
            pts: Array of four ``(x, y)`` points.

        Returns:
            A float32 array ``[tl, tr, br, bl]``.
        """
        # Sort the points based on their x-coordinates.
        xSorted = pts[np.argsort(pts[:, 0]), :]

        # Grab the left-most and right-most points from the sorted
        # x-coordinate points.
        leftMost = xSorted[:2, :]
        rightMost = xSorted[2:, :]

        # Sort the left-most coordinates according to their y-coordinates so
        # we can grab the top-left and bottom-left points, respectively.
        leftMost = leftMost[np.argsort(leftMost[:, 1]), :]
        (tl, bl) = leftMost

        rightMost = rightMost[np.argsort(rightMost[:, 1]), :]
        (tr, br) = rightMost

        rect = np.array([tl, tr, br, bl], dtype="float32")
        return rect

    def clip_det_res(self, points, img_height, img_width):
        """Clamp every point into the image bounds, in place.

        Coordinates are limited to ``[0, img_width - 1]`` /
        ``[0, img_height - 1]`` and truncated with ``int``.

        Returns:
            The same ``points`` array that was passed in.
        """
        for pno in range(points.shape[0]):
            points[pno, 0] = int(min(max(points[pno, 0], 0), img_width - 1))
            points[pno, 1] = int(min(max(points[pno, 1], 0), img_height - 1))
        return points

    def filter_tag_det_res(self, dt_boxes, image_shape):
        """Order, clip and size-filter quadrilateral detections.

        Boxes whose width or height (after clipping) is 3 pixels or less
        are discarded.

        Args:
            dt_boxes: Iterable of 4-point boxes.
            image_shape: Shape of the source image; the first two entries
                are used as height and width.

        Returns:
            A NumPy array of the boxes that were kept.
        """
        img_height, img_width = image_shape[0:2]
        dt_boxes_new = []
        for box in dt_boxes:
            box = self.order_points_clockwise(box)
            box = self.clip_det_res(box, img_height, img_width)
            rect_width = int(np.linalg.norm(box[0] - box[1]))
            rect_height = int(np.linalg.norm(box[0] - box[3]))
            if rect_width <= 3 or rect_height <= 3:
                continue
            dt_boxes_new.append(box)
        dt_boxes = np.array(dt_boxes_new)
        return dt_boxes

    def filter_tag_det_res_only_clip(self, dt_boxes, image_shape):
        """Clip polygon detections to the image without reordering or filtering.

        Args:
            dt_boxes: Iterable of point arrays.
            image_shape: Shape of the source image; the first two entries
                are used as height and width.

        Returns:
            A NumPy array built from the clipped polygons.
        """
        img_height, img_width = image_shape[0:2]
        dt_boxes_new = [
            self.clip_det_res(box, img_height, img_width) for box in dt_boxes
        ]
        # NOTE(review): np.array() on polygons with differing point counts
        # would be ragged - confirm the post-processor emits equal lengths.
        dt_boxes = np.array(dt_boxes_new)
        return dt_boxes

    def __call__(self, img):
        """Detect text regions in a single image.

        Args:
            img: Input image array.

        Returns:
            A tuple ``(dt_boxes, elapse)``. ``dt_boxes`` is the array of
            detections, or ``None`` (with ``elapse == 0``) when
            pre-processing produced no data. ``elapse`` covers inference,
            post-processing and filtering, in seconds.
        """
        ori_im = img.copy()
        data = transform({'image': img}, self.preprocess_op)
        # Check before unpacking: unpacking ``None`` would raise TypeError
        # and make the "no data" return below unreachable.
        if data is None:
            return None, 0
        img, shape_list = data
        if img is None:
            return None, 0
        img = np.expand_dims(img, axis=0)
        shape_list = np.expand_dims(shape_list, axis=0)
        img = img.copy()
        starttime = time.time()

        with torch.no_grad():
            inp = torch.from_numpy(img)
            inp = inp.to(self.device)
            outputs = self.net(inp)

        preds = self._outputs_to_preds(outputs)

        post_result = self.postprocess_op(preds, shape_list)
        dt_boxes = self._filter_boxes(post_result[0]['points'], ori_im.shape)

        elapse = time.time() - starttime
        return dt_boxes, elapse
import pytorchocr_utility as utility +from ...pytorchocr.postprocess import build_post_process + + +class TextRecognizer(BaseOCRV20): + def __init__(self, args, **kwargs): + self.device = args.device + self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")] + self.character_type = args.rec_char_type + self.rec_batch_num = args.rec_batch_num + self.rec_algorithm = args.rec_algorithm + self.max_text_length = args.max_text_length + postprocess_params = { + 'name': 'CTCLabelDecode', + "character_type": args.rec_char_type, + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + if self.rec_algorithm == "SRN": + postprocess_params = { + 'name': 'SRNLabelDecode', + "character_type": args.rec_char_type, + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "RARE": + postprocess_params = { + 'name': 'AttnLabelDecode', + "character_type": args.rec_char_type, + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == 'NRTR': + postprocess_params = { + 'name': 'NRTRLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "SAR": + postprocess_params = { + 'name': 'SARLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == 'ViTSTR': + postprocess_params = { + 'name': 'ViTSTRLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == "CAN": + self.inverse = args.rec_image_inverse + postprocess_params = { + 'name': 'CANLabelDecode', + "character_dict_path": args.rec_char_dict_path, + "use_space_char": args.use_space_char + } + elif self.rec_algorithm == 'RFL': + postprocess_params = { + 'name': 'RFLLabelDecode', + "character_dict_path": None, + 
"use_space_char": args.use_space_char + } + self.postprocess_op = build_post_process(postprocess_params) + + self.limited_max_width = args.limited_max_width + self.limited_min_width = args.limited_min_width + + self.weights_path = args.rec_model_path + self.yaml_path = args.rec_yaml_path + + network_config = utility.get_arch_config(self.weights_path) + weights = self.read_pytorch_weights(self.weights_path) + + self.out_channels = self.get_out_channels(weights) + if self.rec_algorithm == 'NRTR': + self.out_channels = list(weights.values())[-1].numpy().shape[0] + elif self.rec_algorithm == 'SAR': + self.out_channels = list(weights.values())[-3].numpy().shape[0] + + kwargs['out_channels'] = self.out_channels + super(TextRecognizer, self).__init__(network_config, **kwargs) + + self.load_state_dict(weights) + self.net.eval() + self.net.to(self.device) + + def resize_norm_img(self, img, max_wh_ratio): + imgC, imgH, imgW = self.rec_image_shape + if self.rec_algorithm == 'NRTR' or self.rec_algorithm == 'ViTSTR': + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + # return padding_im + image_pil = Image.fromarray(np.uint8(img)) + if self.rec_algorithm == 'ViTSTR': + img = image_pil.resize([imgW, imgH], Image.BICUBIC) + else: + img = image_pil.resize([imgW, imgH], Image.ANTIALIAS) + img = np.array(img) + norm_img = np.expand_dims(img, -1) + norm_img = norm_img.transpose((2, 0, 1)) + if self.rec_algorithm == 'ViTSTR': + norm_img = norm_img.astype(np.float32) / 255. + else: + norm_img = norm_img.astype(np.float32) / 128. - 1. 
+ return norm_img + elif self.rec_algorithm == 'RFL': + img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_CUBIC) + resized_image = resized_image.astype('float32') + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + resized_image -= 0.5 + resized_image /= 0.5 + return resized_image + + assert imgC == img.shape[2] + max_wh_ratio = max(max_wh_ratio, imgW / imgH) + imgW = int((imgH * max_wh_ratio)) + imgW = max(min(imgW, self.limited_max_width), self.limited_min_width) + h, w = img.shape[:2] + ratio = w / float(h) + ratio_imgH = math.ceil(imgH * ratio) + ratio_imgH = max(ratio_imgH, self.limited_min_width) + if ratio_imgH > imgW: + resized_w = imgW + else: + resized_w = int(ratio_imgH) + resized_image = cv2.resize(img, (resized_w, imgH)) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) + padding_im[:, :, 0:resized_w] = resized_image + return padding_im + + def resize_norm_img_svtr(self, img, image_shape): + + imgC, imgH, imgW = image_shape + resized_image = cv2.resize( + img, (imgW, imgH), interpolation=cv2.INTER_LINEAR) + resized_image = resized_image.astype('float32') + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + return resized_image + + + def resize_norm_img_srn(self, img, image_shape): + imgC, imgH, imgW = image_shape + + img_black = np.zeros((imgH, imgW)) + im_hei = img.shape[0] + im_wid = img.shape[1] + + if im_wid <= im_hei * 1: + img_new = cv2.resize(img, (imgH * 1, imgH)) + elif im_wid <= im_hei * 2: + img_new = cv2.resize(img, (imgH * 2, imgH)) + elif im_wid <= im_hei * 3: + img_new = cv2.resize(img, (imgH * 3, imgH)) + else: + img_new = cv2.resize(img, (imgW, imgH)) + + img_np = np.asarray(img_new) + img_np = cv2.cvtColor(img_np, 
cv2.COLOR_BGR2GRAY) + img_black[:, 0:img_np.shape[1]] = img_np + img_black = img_black[:, :, np.newaxis] + + row, col, c = img_black.shape + c = 1 + + return np.reshape(img_black, (c, row, col)).astype(np.float32) + + def srn_other_inputs(self, image_shape, num_heads, max_text_length): + + imgC, imgH, imgW = image_shape + feature_dim = int((imgH / 8) * (imgW / 8)) + + encoder_word_pos = np.array(range(0, feature_dim)).reshape( + (feature_dim, 1)).astype('int64') + gsrm_word_pos = np.array(range(0, max_text_length)).reshape( + (max_text_length, 1)).astype('int64') + + gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length)) + gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape( + [-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias1 = np.tile( + gsrm_slf_attn_bias1, + [1, num_heads, 1, 1]).astype('float32') * [-1e9] + + gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape( + [-1, 1, max_text_length, max_text_length]) + gsrm_slf_attn_bias2 = np.tile( + gsrm_slf_attn_bias2, + [1, num_heads, 1, 1]).astype('float32') * [-1e9] + + encoder_word_pos = encoder_word_pos[np.newaxis, :] + gsrm_word_pos = gsrm_word_pos[np.newaxis, :] + + return [ + encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2 + ] + + def process_image_srn(self, img, image_shape, num_heads, max_text_length): + norm_img = self.resize_norm_img_srn(img, image_shape) + norm_img = norm_img[np.newaxis, :] + + [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \ + self.srn_other_inputs(image_shape, num_heads, max_text_length) + + gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32) + gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32) + encoder_word_pos = encoder_word_pos.astype(np.int64) + gsrm_word_pos = gsrm_word_pos.astype(np.int64) + + return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, + gsrm_slf_attn_bias2) + + def resize_norm_img_sar(self, img, image_shape, + 
width_downsample_ratio=0.25): + imgC, imgH, imgW_min, imgW_max = image_shape + h = img.shape[0] + w = img.shape[1] + valid_ratio = 1.0 + # make sure new_width is an integral multiple of width_divisor. + width_divisor = int(1 / width_downsample_ratio) + # resize + ratio = w / float(h) + resize_w = math.ceil(imgH * ratio) + if resize_w % width_divisor != 0: + resize_w = round(resize_w / width_divisor) * width_divisor + if imgW_min is not None: + resize_w = max(imgW_min, resize_w) + if imgW_max is not None: + valid_ratio = min(1.0, 1.0 * resize_w / imgW_max) + resize_w = min(imgW_max, resize_w) + resized_image = cv2.resize(img, (resize_w, imgH)) + resized_image = resized_image.astype('float32') + # norm + if image_shape[0] == 1: + resized_image = resized_image / 255 + resized_image = resized_image[np.newaxis, :] + else: + resized_image = resized_image.transpose((2, 0, 1)) / 255 + resized_image -= 0.5 + resized_image /= 0.5 + resize_shape = resized_image.shape + padding_im = -1.0 * np.ones((imgC, imgH, imgW_max), dtype=np.float32) + padding_im[:, :, 0:resize_w] = resized_image + pad_shape = padding_im.shape + + return padding_im, resize_shape, pad_shape, valid_ratio + + + def norm_img_can(self, img, image_shape): + + img = cv2.cvtColor( + img, cv2.COLOR_BGR2GRAY) # CAN only predict gray scale image + + if self.inverse: + img = 255 - img + + if self.rec_image_shape[0] == 1: + h, w = img.shape + _, imgH, imgW = self.rec_image_shape + if h < imgH or w < imgW: + padding_h = max(imgH - h, 0) + padding_w = max(imgW - w, 0) + img_padded = np.pad(img, ((0, padding_h), (0, padding_w)), + 'constant', + constant_values=(255)) + img = img_padded + + img = np.expand_dims(img, 0) / 255.0 # h,w,c -> c,h,w + img = img.astype('float32') + + return img + + def __call__(self, img_list, tqdm_enable=False): + img_num = len(img_list) + # Calculate the aspect ratio of all text bars + width_list = [] + for img in img_list: + width_list.append(img.shape[1] / float(img.shape[0])) + # Sorting 
can speed up the recognition process + indices = np.argsort(np.array(width_list)) + + # rec_res = [] + rec_res = [['', 0.0]] * img_num + batch_num = self.rec_batch_num + elapse = 0 + # for beg_img_no in range(0, img_num, batch_num): + with tqdm(total=img_num, desc='OCR-rec Predict', disable=not tqdm_enable) as pbar: + index = 0 + for beg_img_no in range(0, img_num, batch_num): + end_img_no = min(img_num, beg_img_no + batch_num) + norm_img_batch = [] + max_wh_ratio = 0 + for ino in range(beg_img_no, end_img_no): + # h, w = img_list[ino].shape[0:2] + h, w = img_list[indices[ino]].shape[0:2] + wh_ratio = w * 1.0 / h + max_wh_ratio = max(max_wh_ratio, wh_ratio) + for ino in range(beg_img_no, end_img_no): + if self.rec_algorithm == "SAR": + norm_img, _, _, valid_ratio = self.resize_norm_img_sar( + img_list[indices[ino]], self.rec_image_shape) + norm_img = norm_img[np.newaxis, :] + valid_ratio = np.expand_dims(valid_ratio, axis=0) + valid_ratios = [] + valid_ratios.append(valid_ratio) + norm_img_batch.append(norm_img) + + elif self.rec_algorithm == "SVTR": + norm_img = self.resize_norm_img_svtr(img_list[indices[ino]], + self.rec_image_shape) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + elif self.rec_algorithm == "SRN": + norm_img = self.process_image_srn(img_list[indices[ino]], + self.rec_image_shape, 8, + self.max_text_length) + encoder_word_pos_list = [] + gsrm_word_pos_list = [] + gsrm_slf_attn_bias1_list = [] + gsrm_slf_attn_bias2_list = [] + encoder_word_pos_list.append(norm_img[1]) + gsrm_word_pos_list.append(norm_img[2]) + gsrm_slf_attn_bias1_list.append(norm_img[3]) + gsrm_slf_attn_bias2_list.append(norm_img[4]) + norm_img_batch.append(norm_img[0]) + elif self.rec_algorithm == "CAN": + norm_img = self.norm_img_can(img_list[indices[ino]], + max_wh_ratio) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_image_mask = np.ones(norm_img.shape, dtype='float32') + word_label = np.ones([1, 36], dtype='int64') + 
norm_img_mask_batch = [] + word_label_list = [] + norm_img_mask_batch.append(norm_image_mask) + word_label_list.append(word_label) + else: + norm_img = self.resize_norm_img(img_list[indices[ino]], + max_wh_ratio) + norm_img = norm_img[np.newaxis, :] + norm_img_batch.append(norm_img) + norm_img_batch = np.concatenate(norm_img_batch) + norm_img_batch = norm_img_batch.copy() + + if self.rec_algorithm == "SRN": + starttime = time.time() + encoder_word_pos_list = np.concatenate(encoder_word_pos_list) + gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list) + gsrm_slf_attn_bias1_list = np.concatenate( + gsrm_slf_attn_bias1_list) + gsrm_slf_attn_bias2_list = np.concatenate( + gsrm_slf_attn_bias2_list) + + with torch.no_grad(): + inp = torch.from_numpy(norm_img_batch) + encoder_word_pos_inp = torch.from_numpy(encoder_word_pos_list) + gsrm_word_pos_inp = torch.from_numpy(gsrm_word_pos_list) + gsrm_slf_attn_bias1_inp = torch.from_numpy(gsrm_slf_attn_bias1_list) + gsrm_slf_attn_bias2_inp = torch.from_numpy(gsrm_slf_attn_bias2_list) + + inp = inp.to(self.device) + encoder_word_pos_inp = encoder_word_pos_inp.to(self.device) + gsrm_word_pos_inp = gsrm_word_pos_inp.to(self.device) + gsrm_slf_attn_bias1_inp = gsrm_slf_attn_bias1_inp.to(self.device) + gsrm_slf_attn_bias2_inp = gsrm_slf_attn_bias2_inp.to(self.device) + + backbone_out = self.net.backbone(inp) # backbone_feat + prob_out = self.net.head(backbone_out, [encoder_word_pos_inp, gsrm_word_pos_inp, gsrm_slf_attn_bias1_inp, gsrm_slf_attn_bias2_inp]) + # preds = {"predict": prob_out[2]} + preds = {"predict": prob_out["predict"]} + + elif self.rec_algorithm == "SAR": + starttime = time.time() + # valid_ratios = np.concatenate(valid_ratios) + # inputs = [ + # norm_img_batch, + # valid_ratios, + # ] + + with torch.no_grad(): + inp = torch.from_numpy(norm_img_batch) + inp = inp.to(self.device) + preds = self.net(inp) + + elif self.rec_algorithm == "CAN": + starttime = time.time() + norm_img_mask_batch = 
np.concatenate(norm_img_mask_batch) + word_label_list = np.concatenate(word_label_list) + inputs = [norm_img_batch, norm_img_mask_batch, word_label_list] + + inp = [torch.from_numpy(e_i) for e_i in inputs] + inp = [e_i.to(self.device) for e_i in inp] + with torch.no_grad(): + outputs = self.net(inp) + outputs = [v.cpu().numpy() for k, v in enumerate(outputs)] + + preds = outputs + + else: + starttime = time.time() + + with torch.no_grad(): + inp = torch.from_numpy(norm_img_batch) + inp = inp.to(self.device) + prob_out = self.net(inp) + + if isinstance(prob_out, list): + preds = [v.cpu().numpy() for v in prob_out] + else: + preds = prob_out.cpu().numpy() + + rec_result = self.postprocess_op(preds) + for rno in range(len(rec_result)): + rec_res[indices[beg_img_no + rno]] = rec_result[rno] + elapse += time.time() - starttime + + # 更新进度条,每次增加batch_size,但要注意最后一个batch可能不足batch_size + current_batch_size = min(batch_num, img_num - index * batch_num) + index += 1 + pbar.update(current_batch_size) + + # Fix NaN values in recognition results + for i in range(len(rec_res)): + text, score = rec_res[i] + if isinstance(score, float) and math.isnan(score): + rec_res[i] = (text, 0.0) + + return rec_res, elapse diff --git a/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/infer/predict_system.py b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/infer/predict_system.py new file mode 100755 index 0000000000000000000000000000000000000000..e35b9a4b1535ad89d7df2e2be6d31c5475d2acb2 --- /dev/null +++ b/vendor/mineru/mineru/model/ocr/paddleocr2pytorch/tools/infer/predict_system.py @@ -0,0 +1,104 @@ +import cv2 +import copy +import numpy as np + +from . import predict_rec +from . import predict_det +from . 
import predict_cls + + +class TextSystem(object): + def __init__(self, args, **kwargs): + self.text_detector = predict_det.TextDetector(args, **kwargs) + self.text_recognizer = predict_rec.TextRecognizer(args, **kwargs) + self.use_angle_cls = args.use_angle_cls + self.drop_score = args.drop_score + if self.use_angle_cls: + self.text_classifier = predict_cls.TextClassifier(args, **kwargs) + + def get_rotate_crop_image(self, img, points): + ''' + img_height, img_width = img.shape[0:2] + left = int(np.min(points[:, 0])) + right = int(np.max(points[:, 0])) + top = int(np.min(points[:, 1])) + bottom = int(np.max(points[:, 1])) + img_crop = img[top:bottom, left:right, :].copy() + points[:, 0] = points[:, 0] - left + points[:, 1] = points[:, 1] - top + ''' + img_crop_width = int( + max( + np.linalg.norm(points[0] - points[1]), + np.linalg.norm(points[2] - points[3]))) + img_crop_height = int( + max( + np.linalg.norm(points[0] - points[3]), + np.linalg.norm(points[1] - points[2]))) + pts_std = np.float32([[0, 0], [img_crop_width, 0], + [img_crop_width, img_crop_height], + [0, img_crop_height]]) + M = cv2.getPerspectiveTransform(points, pts_std) + dst_img = cv2.warpPerspective( + img, + M, (img_crop_width, img_crop_height), + borderMode=cv2.BORDER_REPLICATE, + flags=cv2.INTER_CUBIC) + dst_img_height, dst_img_width = dst_img.shape[0:2] + if dst_img_height * 1.0 / dst_img_width >= 1.5: + dst_img = np.rot90(dst_img) + return dst_img + + def __call__(self, img): + ori_im = img.copy() + dt_boxes, elapse = self.text_detector(img) + print("dt_boxes num : {}, elapse : {}".format( + len(dt_boxes), elapse)) + if dt_boxes is None: + return None, None + img_crop_list = [] + + dt_boxes = sorted_boxes(dt_boxes) + + for bno in range(len(dt_boxes)): + tmp_box = copy.deepcopy(dt_boxes[bno]) + img_crop = self.get_rotate_crop_image(ori_im, tmp_box) + img_crop_list.append(img_crop) + if self.use_angle_cls: + img_crop_list, angle_list, elapse = self.text_classifier( + img_crop_list) + 
def sorted_boxes(dt_boxes):
    """Sort text boxes from top to bottom, then left to right.

    Boxes are first ordered by the ``(y, x)`` of their first corner. Boxes
    whose first-corner ``y`` values differ by less than 10 are treated as
    lying on the same text line and are ordered by ``x`` within that line.

    Args:
        dt_boxes: Sequence or array of boxes, each indexable as
            ``box[0][0]`` (x) and ``box[0][1]`` (y) of its first corner.

    Returns:
        list: The boxes in reading order.
    """
    _boxes = sorted(dt_boxes, key=lambda box: (box[0][1], box[0][0]))

    for i in range(len(_boxes) - 1):
        # Bubble box i + 1 leftwards for as long as its predecessor is on the
        # same line and further right. A single adjacent swap per box is not
        # enough when three or more boxes share a line.
        for j in range(i, -1, -1):
            same_line = abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10
            if same_line and _boxes[j + 1][0][0] < _boxes[j][0][0]:
                _boxes[j], _boxes[j + 1] = _boxes[j + 1], _boxes[j]
            else:
                break
    return _boxes
type=str2bool, default=True) + parser.add_argument("--rec", type=str2bool, default=True) + parser.add_argument("--device", type=str, default='cpu') + # parser.add_argument("--ir_optim", type=str2bool, default=True) + # parser.add_argument("--use_tensorrt", type=str2bool, default=False) + # parser.add_argument("--use_fp16", type=str2bool, default=False) + parser.add_argument("--gpu_mem", type=int, default=500) + parser.add_argument("--warmup", type=str2bool, default=False) + + # params for text detector + parser.add_argument("--image_dir", type=str) + parser.add_argument("--det_algorithm", type=str, default='DB') + parser.add_argument("--det_model_path", type=str) + parser.add_argument("--det_limit_side_len", type=float, default=960) + parser.add_argument("--det_limit_type", type=str, default='max') + + # DB parmas + parser.add_argument("--det_db_thresh", type=float, default=0.3) + parser.add_argument("--det_db_box_thresh", type=float, default=0.6) + parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5) + parser.add_argument("--max_batch_size", type=int, default=10) + parser.add_argument("--use_dilation", type=str2bool, default=False) + parser.add_argument("--det_db_score_mode", type=str, default="fast") + + # EAST parmas + parser.add_argument("--det_east_score_thresh", type=float, default=0.8) + parser.add_argument("--det_east_cover_thresh", type=float, default=0.1) + parser.add_argument("--det_east_nms_thresh", type=float, default=0.2) + + # SAST parmas + parser.add_argument("--det_sast_score_thresh", type=float, default=0.5) + parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2) + parser.add_argument("--det_sast_polygon", type=str2bool, default=False) + + # PSE parmas + parser.add_argument("--det_pse_thresh", type=float, default=0) + parser.add_argument("--det_pse_box_thresh", type=float, default=0.85) + parser.add_argument("--det_pse_min_area", type=float, default=16) + parser.add_argument("--det_pse_box_type", type=str, 
default='box') + parser.add_argument("--det_pse_scale", type=int, default=1) + + # FCE parmas + parser.add_argument("--scales", type=list, default=[8, 16, 32]) + parser.add_argument("--alpha", type=float, default=1.0) + parser.add_argument("--beta", type=float, default=1.0) + parser.add_argument("--fourier_degree", type=int, default=5) + parser.add_argument("--det_fce_box_type", type=str, default='poly') + + # params for text recognizer + parser.add_argument("--rec_algorithm", type=str, default='CRNN') + parser.add_argument("--rec_model_path", type=str) + parser.add_argument("--rec_image_inverse", type=str2bool, default=True) + parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320") + parser.add_argument("--rec_char_type", type=str, default='ch') + parser.add_argument("--rec_batch_num", type=int, default=6) + parser.add_argument("--max_text_length", type=int, default=25) + + parser.add_argument("--use_space_char", type=str2bool, default=True) + parser.add_argument("--drop_score", type=float, default=0.5) + parser.add_argument("--limited_max_width", type=int, default=1280) + parser.add_argument("--limited_min_width", type=int, default=16) + + parser.add_argument( + "--vis_font_path", type=str, + default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'doc/fonts/simfang.ttf')) + parser.add_argument( + "--rec_char_dict_path", + type=str, + default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + 'pytorchocr/utils/ppocr_keys_v1.txt')) + + # params for text classifier + parser.add_argument("--use_angle_cls", type=str2bool, default=False) + parser.add_argument("--cls_model_path", type=str) + parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192") + parser.add_argument("--label_list", type=list, default=['0', '180']) + parser.add_argument("--cls_batch_num", type=int, default=6) + parser.add_argument("--cls_thresh", type=float, default=0.9) + + 
parser.add_argument("--enable_mkldnn", type=str2bool, default=False) + parser.add_argument("--use_pdserving", type=str2bool, default=False) + + # params for e2e + parser.add_argument("--e2e_algorithm", type=str, default='PGNet') + parser.add_argument("--e2e_model_path", type=str) + parser.add_argument("--e2e_limit_side_len", type=float, default=768) + parser.add_argument("--e2e_limit_type", type=str, default='max') + + # PGNet parmas + parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5) + parser.add_argument( + "--e2e_char_dict_path", type=str, + default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + 'pytorchocr/utils/ic15_dict.txt')) + parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext') + parser.add_argument("--e2e_pgnet_polygon", type=bool, default=True) + parser.add_argument("--e2e_pgnet_mode", type=str, default='fast') + + # SR parmas + parser.add_argument("--sr_model_path", type=str) + parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128") + parser.add_argument("--sr_batch_num", type=int, default=1) + + # params .yaml + parser.add_argument("--det_yaml_path", type=str, default=None) + parser.add_argument("--rec_yaml_path", type=str, default=None) + parser.add_argument("--cls_yaml_path", type=str, default=None) + parser.add_argument("--e2e_yaml_path", type=str, default=None) + parser.add_argument("--sr_yaml_path", type=str, default=None) + + # multi-process + parser.add_argument("--use_mp", type=str2bool, default=False) + parser.add_argument("--total_process_num", type=int, default=1) + parser.add_argument("--process_id", type=int, default=0) + + parser.add_argument("--benchmark", type=str2bool, default=False) + parser.add_argument("--save_log_path", type=str, default="./log_output/") + + parser.add_argument("--show_log", type=str2bool, default=True) + + return parser + +def parse_args(): + parser = init_args() + return parser.parse_args() + +def 
def AnalysisConfig(weights_path, yaml_path=None, char_num=None):
    """Return the network architecture config for a weights file.

    Args:
        weights_path: Path to the weights file; it must exist.
        yaml_path: Optional YAML file describing the architecture.
        char_num: Forwarded to ``read_network_config_from_yaml``.

    Returns:
        The result of ``read_network_config_from_yaml`` when ``yaml_path`` is
        given, otherwise ``None``.

    Raises:
        FileNotFoundError: If ``weights_path`` does not exist.
    """
    resolved_weights = os.path.abspath(weights_path)
    if not os.path.exists(resolved_weights):
        raise FileNotFoundError('{} is not found.'.format(weights_path))

    if yaml_path is None:
        return None
    return read_network_config_from_yaml(yaml_path, char_num=char_num)
def base64_to_cv2(b64str):
    """Decode a base64-encoded image into an OpenCV image.

    Args:
        b64str (str): Base64 text of an encoded image file.

    Returns:
        The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)`` on the decoded
        bytes.
    """
    import base64
    raw = base64.b64decode(b64str.encode('utf8'))
    # np.frombuffer replaces np.fromstring, whose binary mode is deprecated.
    buffer = np.frombuffer(raw, np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_COLOR)
class DataCollator:
    """Collate reading-order features into padded batch tensors."""

    def __call__(self, features: List[dict]) -> Dict[str, torch.Tensor]:
        """Build the model inputs for a list of features.

        Args:
            features: Dicts with ``"source_boxes"`` (a list of 4-value boxes)
                and ``"target_index"`` (a list of 1-based target positions) of
                equal length.

        Returns:
            A dict with the tensors ``bbox``, ``attention_mask``, ``labels``
            and ``input_ids``, all padded to the longest sequence in the batch.
            Labels are converted to 0-based; special, padded and out-of-range
            positions carry ``-100``.
        """
        empty_box = [0, 0, 0, 0]

        # 1) Clip every feature to MAX_LEN items and create its token ids/mask.
        clipped = []
        for feature in features:
            boxes = feature["source_boxes"][:MAX_LEN]
            targets = feature["target_index"][:MAX_LEN]
            token_ids = [UNK_TOKEN_ID] * len(boxes)
            mask = [1] * len(boxes)
            assert len(boxes) == len(targets) == len(token_ids) == len(mask)
            clipped.append((boxes, targets, token_ids, mask))

        # 2) Frame every sequence with the CLS and EOS positions.
        framed = []
        for boxes, targets, token_ids, mask in clipped:
            framed.append((
                [empty_box] + boxes + [empty_box],
                [-100] + targets + [-100],
                [CLS_TOKEN_ID] + token_ids + [EOS_TOKEN_ID],
                [1] + mask + [1],
            ))

        # 3) Right-pad everything to the longest sequence in the batch.
        width = max(len(row[0]) for row in framed)
        bbox, labels, input_ids, attention_mask = [], [], [], []
        for boxes, targets, token_ids, mask in framed:
            bbox.append(boxes + [empty_box] * (width - len(boxes)))
            labels.append(targets + [-100] * (width - len(targets)))
            input_ids.append(token_ids + [EOS_TOKEN_ID] * (width - len(token_ids)))
            attention_mask.append(mask + [0] * (width - len(mask)))

        ret = {
            "bbox": torch.tensor(bbox),
            "attention_mask": torch.tensor(attention_mask),
            "labels": torch.tensor(labels),
            "input_ids": torch.tensor(input_ids),
        }
        label_tensor = ret["labels"]
        # Targets may point beyond the clipped length; those are ignored.
        label_tensor[label_tensor > MAX_LEN] = -100
        # The remaining targets are 1-based; shift them to 0-based.
        label_tensor[label_tensor > 0] -= 1
        return ret
def split_projection_profile(arr_values: np.ndarray, min_value: float, min_gap: float):
    """Split a projection profile into runs separated by wide enough gaps.

    Positions whose value exceeds ``min_value`` are "occupied". Neighbouring
    occupied positions belong to the same group unless the distance between
    them is greater than ``min_gap``.

    Example with ``min_value=0`` and ``min_gap=1``::

        values: 0 2 2 0 0 0 3 1 0 4
        groups:   [1,3)     [6,8) [9,10)

    Args:
        arr_values (np.ndarray): 1-d array representing the projection profile.
        min_value (float): Positions with a value not above this are ignored.
        min_gap (float): Gaps not larger than this do not split a group.

    Returns:
        ``None`` when no position exceeds ``min_value``; otherwise a tuple
        ``(starts, ends)`` of index arrays, with ``ends`` exclusive so that
        ``arr_values[start:end]`` selects a group.
    """
    occupied = np.nonzero(arr_values > min_value)[0]
    if len(occupied) == 0:
        return

    # Position k marks a split when occupied[k + 1] - occupied[k] > min_gap:
    # occupied[k] is the last index of one group and occupied[k + 1] the first
    # index of the next.
    split_at = np.nonzero(np.diff(occupied) > min_gap)[0]

    group_starts = np.concatenate((occupied[:1], occupied[split_at + 1]))
    # +1 turns the inclusive last index into an exclusive slice end.
    group_ends = np.concatenate((occupied[split_at], occupied[-1:])) + 1

    return group_starts, group_ends
def vis_points(
    img: np.ndarray, points, texts: List[str] = None, color=(0, 200, 0)
) -> np.ndarray:
    """
    Draw each quadrilateral on ``img`` and, when ``texts`` is given, a filled
    label with the matching text just above the centre of its bounding box.

    Args:
        img: image to draw on; it is passed to the cv2 drawing calls.
        points: [N, 8] 8: x1,y1,x2,y2,x3,y3,x4,y4
        texts: optional labels, one per row of ``points``. When ``None``
            only the outlines are drawn.
        color: colour of the outline and of the label background.

    Returns:
        The annotated image.

    """
    points = np.array(points)
    if texts is not None:
        assert len(texts) == points.shape[0]

    font = cv2.FONT_HERSHEY_SIMPLEX
    for i, _points in enumerate(points):
        vis_polygon(img, _points.reshape(-1, 2), thickness=2, color=color)
        if texts is None:
            # ``texts`` defaults to None; indexing it would raise TypeError,
            # so without labels the outline is all there is to draw.
            continue

        left, top, right, bottom = points_to_bbox(_points)
        cx = (left + right) // 2
        cy = (top + bottom) // 2

        txt = texts[i]
        cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0]
        # Rough horizontal centring: assume about 10 px per character.
        label_x = cx - 5 * len(txt)

        img = cv2.rectangle(
            img,
            (label_x, cy - cat_size[1] - 5),
            (label_x + cat_size[0], cy - 5),
            color,
            -1,
        )

        img = cv2.putText(
            img,
            txt,
            (label_x, cy - 5),
            font,
            0.5,
            (255, 255, 255),
            thickness=1,
            lineType=cv2.LINE_AA,
        )

    return img
= cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) + + # First check the overall image aspect ratio (height/width) + img_height, img_width = bgr_image.shape[:2] + img_aspect_ratio = img_height / img_width if img_width > 0 else 1.0 + img_is_portrait = img_aspect_ratio > 1.2 + + if img_is_portrait: + + det_res = self.ocr_engine.ocr(bgr_image, rec=False)[0] + # Check if table is rotated by analyzing text box aspect ratios + is_rotated = False + if det_res: + vertical_count = 0 + + for box_ocr_res in det_res: + p1, p2, p3, p4 = box_ocr_res + + # Calculate width and height + width = p3[0] - p1[0] + height = p3[1] - p1[1] + + aspect_ratio = width / height if height > 0 else 1.0 + + # Count vertical vs horizontal text boxes + if aspect_ratio < 0.8: # Taller than wide - vertical text + vertical_count += 1 + # elif aspect_ratio > 1.2: # Wider than tall - horizontal text + # horizontal_count += 1 + + # If we have more vertical text boxes than horizontal ones, + # and vertical ones are significant, table might be rotated + if vertical_count >= len(det_res) * 0.3: + is_rotated = True + + # logger.debug(f"Text orientation analysis: vertical={vertical_count}, det_res={len(det_res)}, rotated={is_rotated}") + + # Rotate image if necessary + if is_rotated: + # logger.debug("Table appears to be in portrait orientation, rotating 90 degrees clockwise") + image = cv2.rotate(np.asarray(image), cv2.ROTATE_90_CLOCKWISE) + bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) + + # Continue with OCR on potentially rotated image + ocr_result = self.ocr_engine.ocr(bgr_image)[0] + if ocr_result: + ocr_result = [[item[0], escape_html(item[1][0]), item[1][1]] for item in ocr_result if + len(item) == 2 and isinstance(item[1], tuple)] + else: + ocr_result = None + + + if ocr_result: + try: + table_results = self.table_model(np.asarray(image), ocr_result) + html_code = table_results.pred_html + table_cell_bboxes = table_results.cell_bboxes + logic_points = table_results.logic_points + elapse = 
table_results.elapse + return html_code, table_cell_bboxes, logic_points, elapse + except Exception as e: + logger.exception(e) + + return None, None, None, None diff --git a/vendor/mineru/mineru/model/vlm_hf_model/__init__.py b/vendor/mineru/mineru/model/vlm_hf_model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d463f72f1f21d24ea180224146678112f80156b0 --- /dev/null +++ b/vendor/mineru/mineru/model/vlm_hf_model/__init__.py @@ -0,0 +1,9 @@ +from transformers import AutoConfig, AutoImageProcessor, AutoModelForCausalLM + +from .configuration_mineru2 import Mineru2QwenConfig +from .image_processing_mineru2 import Mineru2ImageProcessor +from .modeling_mineru2 import Mineru2QwenForCausalLM + +AutoConfig.register(Mineru2QwenConfig.model_type, Mineru2QwenConfig) +AutoModelForCausalLM.register(Mineru2QwenConfig, Mineru2QwenForCausalLM) +AutoImageProcessor.register(Mineru2QwenConfig, slow_image_processor_class=Mineru2ImageProcessor) diff --git a/vendor/mineru/mineru/model/vlm_hf_model/configuration_mineru2.py b/vendor/mineru/mineru/model/vlm_hf_model/configuration_mineru2.py new file mode 100644 index 0000000000000000000000000000000000000000..1ac8f6ae5e12df9b79c822727124d095bec12433 --- /dev/null +++ b/vendor/mineru/mineru/model/vlm_hf_model/configuration_mineru2.py @@ -0,0 +1,38 @@ +from transformers import Qwen2Config + + +class Mineru2QwenConfig(Qwen2Config): + model_type = "mineru2_qwen" + + def __init__( + self, + ignore_index=-100, + image_aspect_ratio="square_anyres_max_9", + image_grid_pinpoints="(1x1),...,(4x4)", + image_token_index=151646, + mm_hidden_size=1152, + mm_patch_merge_type="spatial_unpad", + mm_projector_type="mlp2x_gelu", + mm_vision_select_feature="full", + mm_vision_select_layer=-2, + mm_vision_tower="google/siglip-so400m-patch14-384", + tie_word_embeddings=False, + tokenizer_model_max_length=16384, + tokenizer_padding_side="right", + unfreeze_mm_vision_tower=True, + **kwargs, + ): + self.ignore_index = ignore_index 
def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
    """Pick the candidate resolution that keeps the most of the original image.

    For every candidate the image is scaled (aspect ratio preserved) to fit
    inside it. Candidates are ranked by the resulting effective pixel count,
    capped at the original pixel count; ties go to the candidate that wastes
    the fewest pixels. Among fully equal candidates the first one wins.

    Args:
        original_size: ``(width, height)`` of the source image.
        possible_resolutions: iterable of ``(width, height)`` candidates.

    Returns:
        The chosen ``(width, height)`` tuple, or ``(0, 0)`` when no candidate
        is given.
    """
    src_w, src_h = original_size
    src_area = src_w * src_h

    chosen = (0, 0)
    # Score is (effective area, -wasted area); a lexicographically larger
    # score is a strictly better candidate.
    chosen_score = (0, -float("inf"))

    for cand_w, cand_h in possible_resolutions:
        ratio = min(cand_w / src_w, cand_h / src_h)
        fitted_area = int(src_w * ratio) * int(src_h * ratio)
        effective = min(fitted_area, src_area)
        score = (effective, effective - cand_w * cand_h)
        if score > chosen_score:
            chosen, chosen_score = (cand_w, cand_h), score

    return chosen
# This function is not used.
def resize_and_pad_image(image, target_resolution):
    """Scale ``image`` to fit ``target_resolution`` and centre it on black.

    The image is resized with its aspect ratio preserved, so that it fits
    inside the target, then pasted in the middle of a new black RGB canvas
    of exactly the target size.

    Args:
        image: object exposing ``size`` as ``(width, height)`` and
            ``resize``, as a PIL image does.
        target_resolution: ``(width, height)`` of the returned canvas.

    Returns:
        A new RGB image of size ``target_resolution``.
    """
    src_w, src_h = image.size
    dst_w, dst_h = target_resolution

    ratio_w = dst_w / src_w
    ratio_h = dst_h / src_h

    if ratio_w < ratio_h:
        # Width is the limiting side: fill it and derive the height.
        fit_w, fit_h = dst_w, min(math.ceil(src_h * ratio_w), dst_h)
    else:
        # Height is the limiting side: fill it and derive the width.
        fit_w, fit_h = min(math.ceil(src_w * ratio_h), dst_w), dst_h

    scaled = image.resize((fit_w, fit_h))

    canvas = Image.new("RGB", (dst_w, dst_h), (0, 0, 0))
    offset = ((dst_w - fit_w) // 2, (dst_h - fit_h) // 2)
    canvas.paste(scaled, offset)

    return canvas
def process_images(images, image_processor, model_cfg):
    """Preprocess ``images`` according to ``model_cfg.image_aspect_ratio``.

    * ``"pad"``: each image is padded to a square with the processor's mean
      colour, then preprocessed individually.
    * ``"anyres"`` or any value containing ``"anyres_max"``: each image goes
      through ``process_anyres_image`` with ``model_cfg.image_grid_pinpoints``.
    * anything else, including a missing or ``None`` setting: the whole list
      is handed to ``image_processor`` in one call.

    Args:
        images: sequence of images to preprocess.
        image_processor: processor that is callable and exposes
            ``preprocess`` and ``image_mean``.
        model_cfg: config object; ``image_aspect_ratio`` and
            ``image_grid_pinpoints`` are read from it.

    Returns:
        In the pad/anyres modes, a stacked tensor when all per-image results
        share one shape, otherwise the list of per-image tensors (an empty
        list for empty input). In the default mode, the processor's
        ``"pixel_values"``.
    """
    # An explicit ``None`` on the config must behave like "not set";
    # otherwise the substring test below raises TypeError.
    image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", "") or ""
    new_images = []
    if image_aspect_ratio == "pad":
        for image in images:
            image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean))
            image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
            new_images.append(image)
    elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio:
        for image in images:
            image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints)
            new_images.append(image)
    else:
        return image_processor(images, return_tensors="pt")["pixel_values"]
    # torch.stack rejects an empty list, so only stack when there is
    # something to stack and the shapes agree.
    if new_images and all(x.shape == new_images[0].shape for x in new_images):
        new_images = torch.stack(new_images, dim=0)
    return new_images
self.image_std = image_std + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.data_format = data_format + self.crop_size = crop_size + self.image_aspect_ratio = image_aspect_ratio + self.image_grid_pinpoints = image_grid_pinpoints + self.in_e2e_processing = False + + def _preprocess(self, images): + if isinstance(images, Image.Image): + images = [images] + else: + # to adapt video data + images = [to_numpy_array(image) for image in images] + assert isinstance(images, list) + + transforms = [ + convert_to_rgb, + to_numpy_array, + partial(resize, size=self.size, resample=self.resample, data_format=self.data_format), + partial(rescale, scale=self.rescale_factor, data_format=self.data_format), + partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format), + partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format), + ] + + images = reduce(lambda x, f: [*map(f, x)], transforms, images) + return {"pixel_values": images} + + def _preprocess_end_to_end(self, images): + image_aspect_ratio = self.image_aspect_ratio + image_grid_pinpoints = self.image_grid_pinpoints + assert image_aspect_ratio is not None + assert image_grid_pinpoints is not None + + pixel_values = [] + if image_aspect_ratio == "pad": + for image in images: + image = expand2square(image, tuple(int(x * 255) for x in self.image_mean)) + image = self._preprocess(image)["pixel_values"][0] + pixel_values.append(image) + elif image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio: + for image in images: + image = process_anyres_image(image, self, self.image_grid_pinpoints) + pixel_values.append(image.numpy()) + else: + pixel_values = self._preprocess(images)["pixel_values"] + + if isinstance(pixel_values, list) and all(x.shape == pixel_values[0].shape for x in pixel_values): + pixel_values = np.stack(pixel_values, axis=0) + + # CAUTION: here used (height, width). 
+ image_sizes = [(image.height, image.width) for image in images] + assert len(pixel_values) == len(image_sizes) + + return {"pixel_values": pixel_values, "image_sizes": image_sizes} + + def preprocess( + self, + images, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ): + if self.image_aspect_ratio is None or self.in_e2e_processing: + data = self._preprocess(images) + else: + assert self.image_grid_pinpoints is not None + self.in_e2e_processing = True + try: + data = self._preprocess_end_to_end(images) + finally: + self.in_e2e_processing = False + + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/vendor/mineru/mineru/model/vlm_hf_model/modeling_mineru2.py b/vendor/mineru/mineru/model/vlm_hf_model/modeling_mineru2.py new file mode 100644 index 0000000000000000000000000000000000000000..a52d7c4485f213e483f3a0883a1b8a33e5737f71 --- /dev/null +++ b/vendor/mineru/mineru/model/vlm_hf_model/modeling_mineru2.py @@ -0,0 +1,449 @@ +import math +import re +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +from transformers import ( + Qwen2ForCausalLM, + Qwen2Model, + SiglipVisionConfig, + SiglipVisionModel, +) +from transformers.generation.utils import GenerateOutput +from transformers.modeling_outputs import CausalLMOutputWithPast + +from .configuration_mineru2 import Mineru2QwenConfig +from .image_processing_mineru2 import Mineru2ImageProcessor, get_anyres_image_grid_shape + + +class SiglipVisionTower(nn.Module): + def __init__(self, vision_tower): + super().__init__() + + self.config = SiglipVisionConfig.from_pretrained(vision_tower) + assert isinstance(self.config, SiglipVisionConfig) + self.config.num_hidden_layers -= 1 # drop the last hidden layer + self.config.vision_use_head = False + + self.vision_tower = SiglipVisionModel(self.config) + self.vision_tower.requires_grad_(False) + + self.image_processor = Mineru2ImageProcessor() + + def forward(self, images): + if type(images) is 
def build_vision_projector(config: Mineru2QwenConfig):
    """Build the module that maps vision features into the LM hidden size.

    The kind of projector is read from ``config.mm_projector_type``
    (``"linear"`` when the attribute is absent):

    * ``"linear"``: one ``nn.Linear(mm_hidden_size, hidden_size)``.
    * ``"mlp<N>x_gelu"``: that linear layer followed by ``N - 1`` pairs of
      ``nn.GELU`` and ``nn.Linear(hidden_size, hidden_size)``.
    * ``"identity"``: ``nn.Identity``.

    Raises:
        ValueError: for any other projector type.
    """
    kind = getattr(config, "mm_projector_type", "linear")

    if kind == "linear":
        return nn.Linear(config.mm_hidden_size, config.hidden_size)

    if kind == "identity":
        return nn.Identity()

    matched = re.match(r"^mlp(\d+)x_gelu$", kind)
    if matched is None:
        raise ValueError(f"Unknown projector type: {kind}")

    depth = int(matched.group(1))
    layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
    for _ in range(depth - 1):
        layers += [nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)]
    return nn.Sequential(*layers)
concat_images = torch.cat([image for image in images], dim=0) + image_features = self.encode_images(concat_images) + split_sizes = [image.shape[0] for image in images] + image_features = torch.split(image_features, split_sizes, dim=0) + mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat") + image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square") + if mm_patch_merge_type == "flat": + image_features = [x.flatten(0, 1) for x in image_features] + elif mm_patch_merge_type.startswith("spatial"): + new_image_features = [] + for image_idx, image_feature in enumerate(image_features): + if image_feature.shape[0] > 1: + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + height = width = self.get_model().vision_tower.num_patches_per_side + assert height * width == base_image_feature.shape[0] + + if "anyres_max" in image_aspect_ratio: + matched_anyres_max_num_patches = re.match(r"square_anyres_max_(\d+)", image_aspect_ratio) + if matched_anyres_max_num_patches: + max_num_patches = int(matched_anyres_max_num_patches.group(1)) + + if image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio: + num_patch_width, num_patch_height = get_anyres_image_grid_shape( + image_sizes[image_idx], + self.config.image_grid_pinpoints, + self.get_model().vision_tower.config.image_size, + ) + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + else: + raise NotImplementedError + if ( + "unpad" in mm_patch_merge_type + and "anyres_max" in image_aspect_ratio + and matched_anyres_max_num_patches + ): + unit = image_feature.shape[2] + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + c, h, w = image_feature.shape + times = math.sqrt(h * w / (max_num_patches * unit**2)) + if times > 1.1: + image_feature = image_feature[None] + image_feature = nn.functional.interpolate( + image_feature, [int(h // times), int(w // 
times)], mode="bilinear" + )[0] + image_feature = torch.cat( + ( + image_feature, + self.model.image_newline[:, None, None] + .expand(*image_feature.shape[:-1], 1) + .to(image_feature.device), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + elif "unpad" in mm_patch_merge_type: + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + image_feature = torch.cat( + ( + image_feature, + self.model.image_newline[:, None, None] + .expand(*image_feature.shape[:-1], 1) + .to(image_feature.device), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + else: + image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + image_feature = image_feature.flatten(0, 3) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + else: + image_feature = image_feature[0] + if "unpad" in mm_patch_merge_type: + image_feature = torch.cat( + (image_feature, self.model.image_newline[None].to(image_feature.device)), dim=0 + ) + new_image_features.append(image_feature) + image_features = new_image_features + else: + raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}") + else: + image_features = self.encode_images(images) + + _labels = labels + _position_ids = position_ids + _attention_mask = attention_mask + if attention_mask is None: + attention_mask = torch.ones_like(input_ids, dtype=torch.bool) + else: + attention_mask = attention_mask.bool() + if position_ids is None: + position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device) + if labels is None: + labels = torch.full_like(input_ids, self.ignore_index) + + # remove the padding using attention_mask -- FIXME + _input_ids = input_ids + input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)] + labels = [cur_labels[cur_attention_mask] for cur_labels, 
cur_attention_mask in zip(labels, attention_mask)] + + new_input_embeds = [] + new_labels = [] + cur_image_idx = 0 + for batch_idx, cur_input_ids in enumerate(input_ids): + num_images = (cur_input_ids == self.image_token_index).sum() + if num_images == 0: + cur_image_features = image_features[cur_image_idx] + cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids) + cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0) + new_input_embeds.append(cur_input_embeds) + new_labels.append(labels[batch_idx]) + cur_image_idx += 1 + continue + + image_token_indices = ( + [-1] + torch.where(cur_input_ids == self.image_token_index)[0].tolist() + [cur_input_ids.shape[0]] + ) + cur_input_ids_noim = [] + cur_labels = labels[batch_idx] + cur_labels_noim = [] + for i in range(len(image_token_indices) - 1): + cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]]) + cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]]) + split_sizes = [x.shape[0] for x in cur_labels_noim] + cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim)) + cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0) + cur_new_input_embeds = [] + cur_new_labels = [] + + for i in range(num_images + 1): + cur_new_input_embeds.append(cur_input_embeds_no_im[i]) + cur_new_labels.append(cur_labels_noim[i]) + if i < num_images: + cur_image_features = image_features[cur_image_idx] + cur_image_idx += 1 + cur_new_input_embeds.append(cur_image_features) + cur_new_labels.append( + torch.full( + (cur_image_features.shape[0],), self.ignore_index, device=cur_labels.device, dtype=cur_labels.dtype + ) + ) + + cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds] + + cur_new_input_embeds = torch.cat(cur_new_input_embeds) + cur_new_labels = torch.cat(cur_new_labels) + + new_input_embeds.append(cur_new_input_embeds) + new_labels.append(cur_new_labels) + + 
# Truncate sequences to max length as image embeddings can make the sequence longer + tokenizer_model_max_length = getattr(self.config, "tokenizer_model_max_length", None) + if tokenizer_model_max_length is not None: + new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds] + new_labels = [x[:tokenizer_model_max_length] for x in new_labels] + + # Combine them + max_len = max(x.shape[0] for x in new_input_embeds) + batch_size = len(new_input_embeds) + + new_input_embeds_padded = [] + new_labels_padded = torch.full( + (batch_size, max_len), self.ignore_index, dtype=new_labels[0].dtype, device=new_labels[0].device + ) + attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device) + position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device) + + for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)): + cur_len = cur_new_embed.shape[0] + if getattr(self.config, "tokenizer_padding_side", "right") == "left": + new_input_embeds_padded.append( + torch.cat( + ( + torch.zeros( + (max_len - cur_len, cur_new_embed.shape[1]), + dtype=cur_new_embed.dtype, + device=cur_new_embed.device, + ), + cur_new_embed, + ), + dim=0, + ) + ) + if cur_len > 0: + new_labels_padded[i, -cur_len:] = cur_new_labels + attention_mask[i, -cur_len:] = True + position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + else: + new_input_embeds_padded.append( + torch.cat( + ( + cur_new_embed, + torch.zeros( + (max_len - cur_len, cur_new_embed.shape[1]), + dtype=cur_new_embed.dtype, + device=cur_new_embed.device, + ), + ), + dim=0, + ) + ) + if cur_len > 0: + new_labels_padded[i, :cur_len] = cur_new_labels + attention_mask[i, :cur_len] = True + position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device) + + new_input_embeds = torch.stack(new_input_embeds_padded, 
def forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    images: Optional[torch.FloatTensor] = None,
    image_sizes: Optional[List[List[int]]] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
    """Run the parent causal-LM forward pass on multimodal inputs.

    When ``inputs_embeds`` is not supplied, the token ids, masks, cache,
    labels, ``images`` and ``image_sizes`` are handed to
    ``prepare_inputs_labels_for_multimodal`` and each of the six values it
    returns replaces the argument of the same name. When ``inputs_embeds``
    is supplied, all arguments are passed through untouched.

    ``images``, ``image_sizes`` and ``cache_position`` are accepted by this
    signature but are not forwarded to the parent ``forward``.
    """
    if inputs_embeds is None:
        prepared = self.prepare_inputs_labels_for_multimodal(
            input_ids, position_ids, attention_mask, past_key_values, labels, images, image_sizes
        )
        input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

    # Collected in one place so the hand-off to the parent class is explicit.
    parent_kwargs = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        labels=labels,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    return super().forward(**parent_kwargs)
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
    """Build the per-step model inputs, re-attaching the image arguments.

    ``images`` and ``image_sizes`` are removed from ``kwargs`` before the
    parent implementation is called, then copied into the returned dict
    under the same keys whenever they are not ``None``.
    """
    # Pull the multimodal extras out first so the parent never sees them.
    extras = {key: kwargs.pop(key, None) for key in ("images", "image_sizes")}

    model_inputs = super().prepare_inputs_for_generation(
        input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
    )

    for key, value in extras.items():
        if value is not None:
            model_inputs[key] = value
    return model_inputs
class BatchEngine(_Engine):
    """
    The engine is patched to support batch multi-modal generate, and early image preprocessing.

    Differences from the parent engine that are visible in this class:

    - ``enable_custom_logit_processor`` is forced to ``True`` on the server args.
    - The tokenizer manager's ``auto_create_handle_loop`` is replaced via
      ``_patch_tokenizer_manager``.
    - Requests are dispatched through the module-level ``_generate_request``.
    - ``Mineru2LogitProcessor`` is used when the caller supplies no custom
      logit processor.
    """

    def __init__(self, server_args: ServerArgs, **kwargs):
        # The default Mineru2LogitProcessor is only usable when the server
        # accepts custom logit processors, so the flag is switched on here.
        server_args.enable_custom_logit_processor = True
        super().__init__(server_args=server_args, **kwargs)
        _patch_tokenizer_manager(self.tokenizer_manager)

    @staticmethod
    def _build_generate_request(
        *,
        prompt,
        sampling_params,
        input_ids,
        image_data,
        return_logprob,
        logprob_start_len,
        top_logprobs_num,
        token_ids_logprob,
        lora_path,
        custom_logit_processor,
        return_hidden_states,
        stream,
    ) -> GenerateReqInput:
        """Assemble the ``GenerateReqInput`` shared by ``generate`` and ``async_generate``."""
        # One ["image"] entry per element when a list of images is given,
        # a flat ["image"] for a single image, and [] when there is none.
        if isinstance(image_data, list):
            modalities_list = [["image"] for _ in image_data]
        elif image_data is not None:
            modalities_list = ["image"]
        else:
            modalities_list = []

        # Fall back to the Mineru2 processor when the caller gives none.
        if custom_logit_processor is None:
            custom_logit_processor = Mineru2LogitProcessor().to_str()

        return GenerateReqInput(
            text=prompt,
            input_ids=input_ids,
            sampling_params=sampling_params,
            image_data=image_data,
            return_logprob=return_logprob,
            logprob_start_len=logprob_start_len,
            top_logprobs_num=top_logprobs_num,
            token_ids_logprob=token_ids_logprob,
            lora_path=lora_path,
            modalities=modalities_list,
            custom_logit_processor=custom_logit_processor,
            return_hidden_states=return_hidden_states,
            stream=stream,
        )

    def generate(
        self,
        # The input prompt. It can be a single prompt or a batch of prompts.
        prompt: Optional[Union[List[str], str]] = None,
        sampling_params: Optional[Union[List[Dict], Dict]] = None,
        # The token ids for text; one can either specify text or input_ids.
        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
        # The image input. It can be a file name, a url, or base64 encoded string.
        # See also python/sglang/srt/utils.py:load_image.
        image_data: Optional[Union[List[str], str]] = None,
        return_logprob: Optional[Union[List[bool], bool]] = False,
        logprob_start_len: Optional[Union[List[int], int]] = None,
        top_logprobs_num: Optional[Union[List[int], int]] = None,
        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
        lora_path: Optional[List[Optional[str]]] = None,
        custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None,
        return_hidden_states: bool = False,
        stream: bool = False,
    ) -> Union[Dict, Iterator[Dict]]:
        """
        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
        Please refer to `GenerateReqInput` for the documentation.

        Returns the first item produced for the request when ``stream`` is
        falsy, otherwise a synchronous iterator over every produced chunk.
        """
        obj = self._build_generate_request(
            prompt=prompt,
            sampling_params=sampling_params,
            input_ids=input_ids,
            image_data=image_data,
            return_logprob=return_logprob,
            logprob_start_len=logprob_start_len,
            top_logprobs_num=top_logprobs_num,
            token_ids_logprob=token_ids_logprob,
            lora_path=lora_path,
            custom_logit_processor=custom_logit_processor,
            return_hidden_states=return_hidden_states,
            stream=stream,
        )
        generator = _generate_request(self.tokenizer_manager, obj, None)

        if stream:

            def generator_wrapper():
                # Pull chunks out of the async generator one at a time via
                # run_async, stopping once the generator is exhausted.
                while True:
                    try:
                        chunk = run_async(generator.__anext__())
                        yield chunk
                    except StopAsyncIteration:
                        break

            return generator_wrapper()
        else:
            ret = run_async(generator.__anext__())
            return ret

    async def async_generate(
        self,
        # The input prompt. It can be a single prompt or a batch of prompts.
        prompt: Optional[Union[List[str], str]] = None,
        sampling_params: Optional[Union[List[Dict], Dict]] = None,
        # The token ids for text; one can either specify text or input_ids.
        input_ids: Optional[Union[List[List[int]], List[int]]] = None,
        # The image input. It can be a file name, a url, or base64 encoded string.
        # See also python/sglang/srt/utils.py:load_image.
        image_data: Optional[Union[List[str], str]] = None,
        return_logprob: Optional[Union[List[bool], bool]] = False,
        logprob_start_len: Optional[Union[List[int], int]] = None,
        top_logprobs_num: Optional[Union[List[int], int]] = None,
        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
        lora_path: Optional[List[Optional[str]]] = None,
        custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None,
        return_hidden_states: bool = False,
        stream: bool = False,
    ) -> Union[Dict, AsyncIterator[Dict], Iterator[Dict]]:
        """
        The arguments of this function is the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
        Please refer to `GenerateReqInput` for the documentation.

        Returns the async generator itself when ``stream is True``,
        otherwise awaits and returns its first item.
        """
        obj = self._build_generate_request(
            prompt=prompt,
            sampling_params=sampling_params,
            input_ids=input_ids,
            image_data=image_data,
            return_logprob=return_logprob,
            logprob_start_len=logprob_start_len,
            top_logprobs_num=top_logprobs_num,
            token_ids_logprob=token_ids_logprob,
            lora_path=lora_path,
            custom_logit_processor=custom_logit_processor,
            return_hidden_states=return_hidden_states,
            stream=stream,
        )
        generator = _generate_request(self.tokenizer_manager, obj, None)

        if stream is True:
            return generator
        else:
            return await generator.__anext__()
async def _one_request(
    self: TokenizerManager,
    obj: Union[GenerateReqInput, EmbeddingReqInput],
    request: Optional[fastapi.Request],
    created_time: Optional[float],
):
    """Tokenize, submit and stream the responses of a single request.

    Yields every item produced by ``self._wait_one_response`` for ``obj``.
    """
    tokenized = await self._tokenize_one_request(obj)
    pending_state = self._send_one_request(obj, tokenized, created_time)
    responses = self._wait_one_response(obj, pending_state, request)
    async for item in responses:
        yield item
async def _generate_request(
    self: TokenizerManager,
    obj: Union[GenerateReqInput, EmbeddingReqInput],
    request: Optional[fastapi.Request] = None,
):
    """Validate, normalize and dispatch ``obj``, yielding its responses.

    A single request is tokenized and sent directly; anything else is
    delegated to ``_handle_batch_request``. Both paths run while holding
    the reader side of ``self.model_update_lock``.

    Raises:
        ValueError: if ``obj`` is an ``EmbeddingReqInput`` while
            ``self.is_generation`` is truthy.
    """
    # Captured before any other work so it reflects the arrival time.
    created_time = time.time()

    self.auto_create_handle_loop()

    if isinstance(obj, EmbeddingReqInput) and self.is_generation:
        raise ValueError(
            "This model does not appear to be an embedding model by default. "
            "Please add `--is-embedding` when launching the server or try another model."
        )

    obj.normalize_batch_and_arguments()

    if self.log_requests:
        max_length, skip_names, _ = self.log_request_metadata
        logger.info(f"Receive: obj={dataclass_to_string_truncated(obj, max_length, skip_names=skip_names)}")

    async with self.model_update_lock.reader_lock:
        if obj.is_single:
            tokenized = await self._tokenize_one_request(obj)
            pending_state = self._send_one_request(obj, tokenized, created_time)
            responses = self._wait_one_response(obj, pending_state, request)
        else:
            responses = _handle_batch_request(self, obj, request, created_time)

        async for item in responses:
            yield item
class Mineru2ImageProcessor(BaseProcessor):
    """Image preprocessing for Mineru2 requests.

    Exposes ``process_images_async`` (marked sglang==0.4.4.post1 in this file)
    and ``process_mm_data_async`` (marked sglang==0.4.5.post3), the latter
    wrapping the former's result into ``MultimodalDataItem`` objects.
    """

    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)

    @staticmethod
    def _process_single_image_task(
        image_data: Union[str, bytes],
        image_aspect_ratio: Optional[str] = None,
        image_grid_pinpoints: Optional[str] = None,
        image_processor=None,
    ):
        """Load one image (or video) and turn it into pixel values.

        Returns:
            ``(pixel_values, image_hash, size)`` where ``size`` is the value
            returned by ``load_image`` for videos and ``image.size`` otherwise.

        Raises:
            Exception: any failure during loading or preprocessing is logged
                and re-raised. Returning ``None`` here would only surface
                later as an unrelated unpacking error in the callers, which
                always destructure the result into three values.
        """
        if image_processor is None:
            # No processor was passed in, so use the process-wide one.
            assert get_global_processor is not None
            image_processor = get_global_processor().image_processor

        try:
            image, image_size = load_image(image_data)
            if image_size is not None:
                # It is a video with multiple images
                image_hash = hash(image_data)
                pixel_values = image_processor(image)["pixel_values"]
                pixel_values = np.stack(pixel_values, axis=0)
                return pixel_values, image_hash, image_size

            # It is an image
            image_hash = hash(image_data)
            if image_aspect_ratio == "pad":
                image = expand2square(
                    image,
                    tuple(int(x * 255) for x in image_processor.image_mean),
                )
                pixel_values = image_processor(image.convert("RGB"))["pixel_values"][0]
            elif image_aspect_ratio == "anyres" or (
                image_aspect_ratio is not None and "anyres_max" in image_aspect_ratio
            ):
                pixel_values = process_anyres_image(image, image_processor, image_grid_pinpoints)
            else:
                pixel_values = image_processor(image)["pixel_values"][0]
            return pixel_values, image_hash, image.size
        except Exception:
            logger.error("Exception in TokenizerManager:\n" + get_exception_traceback())
            raise

    async def _process_single_image(self, image_data: Union[bytes, str], aspect_ratio: str, grid_pinpoints: str):
        """Run ``_process_single_image_task`` on the executor if one exists, else inline."""
        # Prefer `cpu_executor` when the base class defines it, otherwise `executor`.
        executor = self.cpu_executor if hasattr(self, "cpu_executor") else self.executor

        if get_global_processor is not None:
            image_processor = None  # save ipc cost
        else:
            image_processor = self._processor.image_processor

        if executor is None:
            return self._process_single_image_task(
                image_data,
                aspect_ratio,
                grid_pinpoints,
                image_processor,
            )

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            executor,
            Mineru2ImageProcessor._process_single_image_task,
            image_data,
            aspect_ratio,
            grid_pinpoints,
            image_processor,
        )

    # sglang==0.4.4.post1
    async def process_images_async(
        self,
        image_data: List[Union[str, bytes]],
        input_text,
        request_obj,
        *args,
        **kwargs,
    ):
        """Preprocess every image of a request.

        Returns:
            ``None`` when ``image_data`` is empty, otherwise a dict with the
            keys ``pixel_values``, ``image_hashes``, ``image_sizes`` and
            ``modalities``.

        Raises:
            ValueError: if ``image_data`` is neither a string nor a non-empty list.
        """
        if not image_data:
            return None

        modalities = request_obj.modalities or ["image"]
        aspect_ratio = getattr(self.hf_config, "image_aspect_ratio", "")

        # Grid pinpoints are only used by the "anyres" aspect-ratio modes.
        grid_pinpoints = (
            self.hf_config.image_grid_pinpoints
            if hasattr(self.hf_config, "image_grid_pinpoints") and "anyres" in aspect_ratio
            else None
        )

        if isinstance(image_data, str):
            image_data = [image_data]

        if not (isinstance(image_data, list) and len(image_data) > 0):
            raise ValueError(f"Invalid image data: {image_data}")

        if "multi-images" in modalities or "video" in modalities:
            # Multiple images
            aspect_ratio = "pad"  # LLaVA OneVision Handling: more than one image --> interleaved image mode or video mode. We do not use anyres
            pending = [self._process_single_image(img_data, aspect_ratio, grid_pinpoints) for img_data in image_data]
            results = await asyncio.gather(*pending)

            pixel_values, image_hashes, image_sizes = [], [], []
            for pixel_v, image_h, image_s in results:
                pixel_values.append(pixel_v)
                image_hashes.append(image_h)
                image_sizes.append(image_s)

            if isinstance(pixel_values[0], np.ndarray):
                pixel_values = np.stack(pixel_values, axis=0)
        else:
            # A single image
            pixel_values, image_hash, image_size = await self._process_single_image(
                image_data[0], aspect_ratio, grid_pinpoints
            )
            image_hashes = [image_hash]
            image_sizes = [image_size]

        return {
            "pixel_values": pixel_values,
            "image_hashes": image_hashes,
            "image_sizes": image_sizes,
            "modalities": modalities,
        }

    # sglang==0.4.5.post3
    async def process_mm_data_async(
        self,
        image_data: List[Union[str, bytes]],
        input_text,
        request_obj,
        *args,
        **kwargs,
    ):
        """Wrap ``process_images_async`` output in a ``MultimodalDataItem``.

        Returns:
            ``None`` when there is nothing to process, otherwise
            ``{"mm_items": [MultimodalDataItem(...)]}``.
        """
        # Imported here because these names belong to the newer sglang API only.
        from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem

        result = await self.process_images_async(image_data, input_text, request_obj, *args, **kwargs)

        if result is None:
            return None

        modality = Modality.IMAGE
        requested = request_obj.modalities
        # An empty list is treated like "no modality given", matching the
        # `or ["image"]` fallback used in process_images_async.
        if isinstance(requested, list) and requested:
            if requested[0] == "multi-images":
                modality = Modality.MULTI_IMAGES
            elif requested[0] == "video":
                modality = Modality.VIDEO

        return {
            "mm_items": [
                MultimodalDataItem(
                    pixel_values=result["pixel_values"],
                    image_sizes=result["image_sizes"],
                    modality=modality,
                )
            ],
        }
def __call__(self, logits, batch_info: List[dict]):
    """
    Applies repetition constraints to the logits before sampling tokens.

    For every request that sets a positive ``no_repeat_ngram_size`` and has
    produced at least that many tokens, the tokens that would complete an
    already generated n-gram are banned for the next step.

    Args:
        logits: Indexed as ``logits[batch_index][token_id]``; modified in place.
        batch_info (List[dict]): A list of metadata dicts for each sample in the batch.
            Entries that are not dicts or lack "__req__" are skipped. Keys read:
            - "__req__": Request object; its ``rid`` and ``output_ids`` are used.
            - "no_repeat_ngram_size": Size of n-gram to avoid repeating (default 0 = off).

    Returns:
        The same ``logits`` object with banned token logits set to -inf.
    """
    from sglang.srt.managers.schedule_batch import Req

    self._gen_step += 1  # Update global generation step

    for idx, info in enumerate(batch_info):
        if not isinstance(info, dict) or "__req__" not in info:
            continue

        req: Req = info["__req__"]
        rid = req.rid
        output_ids = req.output_ids
        ngram_size = info.get("no_repeat_ngram_size", 0)
        num_tokens = len(output_ids)

        # Skip if there are not enough tokens to form an n-gram
        if ngram_size <= 0 or num_tokens < ngram_size:
            continue

        # Record the current step for cache cleanup tracking
        self._time[rid] = self._gen_step

        ngrams = self._generated_ngrams.get(rid)
        if ngrams is None:
            # No cache for this request (first call, or the entry was dropped
            # by the expiry sweep below): index every n-gram produced so far,
            # otherwise earlier n-grams would never be banned.
            ngrams = self._generated_ngrams[rid] = {}
            first_start = 0
        else:
            # Cache is current up to the previous step: add only the newest n-gram.
            first_start = num_tokens - ngram_size

        for start in range(first_start, num_tokens - ngram_size + 1):
            end = start + ngram_size
            prefix = tuple(output_ids[start : end - 1])
            ngrams.setdefault(prefix, []).append(output_ids[end - 1])

        # Last (ngram_size - 1) tokens. The start index is computed from the
        # length on purpose: `-ngram_size + 1` is 0 when ngram_size == 1 and
        # would select the whole history instead of the empty prefix.
        current_prefix = tuple(output_ids[num_tokens - ngram_size + 1 :])

        # Set the logits of banned tokens to negative infinity
        for token in ngrams.get(current_prefix, ()):
            logits[idx][token] = -float("inf")

    # Clean up cache for requests that were not touched in this call
    expired_rids = [rid for rid, last_used in self._time.items() if last_used < self._gen_step]
    for rid in expired_rids:
        self._generated_ngrams.pop(rid, None)
        self._time.pop(rid, None)

    return logits
def __init__(
    self,
    config: Mineru2QwenConfig,
    quant_config: Optional[QuantizationConfig] = None,
    prefix: str = "",
) -> None:
    """Build the vision tower, the vision projector and the Qwen2 language model.

    Args:
        config: Model configuration. ``projector_hidden_act`` and
            ``image_token_index`` are filled with defaults when unset, and
            ``mm_vision_select_feature`` is overwritten with ``"full"`` for
            SigLIP towers.
        quant_config: Passed to the language model only.
        prefix: Parameter-name prefix passed to ``add_prefix`` for the language model.

    Raises:
        ValueError: if the vision tower path contains neither "clip" nor
            "siglip", or if the feature-selection strategy is not one of
            "patch", "full" or "cls_patch".
    """
    super().__init__()
    self.config = config

    if getattr(self.config, "projector_hidden_act", None) is None:
        self.config.projector_hidden_act = "gelu"
    if getattr(self.config, "image_token_index", None) is None:
        self.config.image_token_index = 151646

    # load vision tower
    mm_vision_tower = self.config.mm_vision_tower
    model_root_path = auto_download_and_get_model_root_path(mm_vision_tower, "vlm")
    mm_vision_tower = f"{model_root_path}/{mm_vision_tower}"

    # The towers are instantiated from their config only; no pretrained
    # weights are loaded in this constructor.
    if "clip" in mm_vision_tower:
        vision_config = CLIPVisionConfig.from_pretrained(mm_vision_tower)
        self.vision_tower = CLIPVisionModel(vision_config)  # type: ignore
    elif "siglip" in mm_vision_tower:
        vision_config = SiglipVisionConfig.from_pretrained(mm_vision_tower)
        self.vision_tower = SiglipVisionModel(vision_config)  # type: ignore
        # Siglip needs all feature tokens
        self.config.mm_vision_select_feature = "full"
    else:
        raise ValueError(f"Unexpected mm_vision_tower: {mm_vision_tower}")

    ### EDIT: change projector
    # the name `projector` contains `proj` which is often used in attention layers, which can cause bugs in quantization.
    self.multi_modal_mlp = build_vision_projector(config)

    self.language_model = Qwen2ForCausalLM(
        config,
        quant_config=quant_config,
        prefix=add_prefix("language_model", prefix),
    )

    if "unpad" in getattr(config, "mm_patch_merge_type", ""):
        self.language_model.model.image_newline = nn.Parameter(torch.empty(config.hidden_size))

    # Keep the vision tower on the same device as the language model.
    language_model_device = next(self.language_model.parameters()).device
    self.vision_tower = self.vision_tower.to(language_model_device)
    self.vision_tower.eval()

    self.vision_feature_layer = self.config.mm_vision_select_layer
    self.vision_feature_select_strategy = self.config.mm_vision_select_feature
    self.image_size = self.vision_tower.config.image_size
    self.patch_size = self.vision_tower.config.patch_size

    self.mm_patch_merge_type = getattr(self.config, "mm_patch_merge_type", "flat")
    self.image_aspect_ratio = getattr(self.config, "image_aspect_ratio", "square")
    self.image_grid_pinpoints = getattr(self.config, "image_grid_pinpoints", None)

    # One feature per patch, plus one extra for the "cls_patch" strategy.
    self.image_feature_len = int((self.image_size // self.patch_size) ** 2)
    if self.vision_feature_select_strategy == "cls_patch":
        self.image_feature_len += 1
    elif self.vision_feature_select_strategy not in ("patch", "full"):
        # Report the attribute that was actually checked; the previous
        # message read a non-existent attribute and masked this error.
        raise ValueError(f"Unexpected select feature: {self.vision_feature_select_strategy}")
def encode_images(self, pixel_values: torch.Tensor) -> torch.Tensor:
    """Encode pixel values with the vision tower and project them.

    The input is moved to the vision tower's device and dtype, the hidden
    state at ``self.vision_feature_layer`` is selected, the first position
    along dim 1 is dropped for the "default"/"patch" strategies (kept for
    "full"), and the result is passed through ``self.multi_modal_mlp``.

    Raises:
        ValueError: if ``self.vision_feature_select_strategy`` is not
            "default", "patch" or "full".
    """
    tower = self.vision_tower
    tower_input = pixel_values.to(device=tower.device, dtype=tower.dtype)

    # NOTE: This is not memory efficient. (output_hidden_states=True) will save all the hidden stated.
    tower_output = tower(tower_input, output_hidden_states=True)
    hidden_state = tower_output.hidden_states[self.vision_feature_layer]

    strategy = self.vision_feature_select_strategy
    if strategy == "full":
        selected = hidden_state
    elif strategy in ("default", "patch"):
        selected = hidden_state[:, 1:]
    else:
        raise ValueError(f"Unexpected select feature strategy: {self.vision_feature_select_strategy}")

    return self.multi_modal_mlp(selected)
+ input_ids.clamp_(min=0, max=self.config.vocab_size - 1) + + # Embed text inputs + input_embeds = self.language_model.model.embed_tokens(input_ids) + + # Got List[List[str]] extend it to List[str] + # The length of the List should be equal to batch size + modalities_list = [] + max_image_offset = [] + for im in image_inputs: + if im: + if hasattr(im, "mm_items"): + # sglang==0.4.5.post3 + modalities_list.extend([downgrade_modality(item.modality) for item in im.mm_items]) + elif im.modalities is not None: + # sglang==0.4.4.post1 + modalities_list.extend(im.modalities) + if im and im.image_offsets: + max_image_offset.append(np.max(np.array(im.image_offsets) + np.array(im.image_pad_len))) + else: + max_image_offset.append(-1) + + start_positions = positions[forward_batch.extend_start_loc].cpu().numpy() + need_vision = start_positions <= np.array(max_image_offset) + + if need_vision.any(): + bs = forward_batch.batch_size + + if is_sglang_mm_inputs: + # sglang==0.4.5.post3 + pixel_values = flatten_nested_list( + [[item.pixel_values for item in image_inputs[i].mm_items] for i in range(bs) if need_vision[i]] + ) # image_inputs[batch_idx].mm_items[item_idx].pixel_values is Tensor + image_sizes = [ + flatten_nested_list([item.image_sizes for item in image_inputs[i].mm_items]) + for i in range(bs) + if need_vision[i] + ] # image_inputs[batch_idx].mm_items[item_idx].image_sizes should be tuple, but is list of tuple for now. 
+ else: + # sglang==0.4.4.post1 + pixel_values = [image_inputs[i].pixel_values for i in range(bs) if need_vision[i]] + image_sizes = [image_inputs[i].image_sizes for i in range(bs) if need_vision[i]] + + ########## Encode Image ######## + + if pixel_values[0].ndim == 4: + # llava-hd: BS, num_patch, C=3, H=336, W=336, num_patch obtained from process_images + np.concatenate(pixel_values, axis=0) + # ndim=4 + concat_images = torch.tensor( + np.concatenate(pixel_values, axis=0), + device=self.vision_tower.device, + ) + image_features = self.encode_images(concat_images) + split_sizes = [image.shape[0] for image in pixel_values] + image_features = torch.split(image_features, split_sizes, dim=0) + # hd image_features: BS, num_patch, 576, 4096 + else: + # normal pixel: BS, C=3, H=336, W=336 + pixel_values = torch.tensor(np.array(pixel_values), device=self.vision_tower.device) + image_features = self.encode_images(pixel_values) + # image_features: BS, 576, 4096 + + if self.mm_patch_merge_type.startswith("spatial"): + new_image_features = [] + height = width = self.num_patches_per_side + for image_idx, image_feature in enumerate(image_features): + if modalities_list[image_idx] == "image": + image_aspect_ratio = self.config.image_aspect_ratio # single image + elif modalities_list[image_idx] == "multi-images" or modalities_list[image_idx] == "video": + image_aspect_ratio = "pad" # multi image + # image_aspect_ratio = ( + # "anyres" if len(image_sizes[image_idx]) == 1 else "pad" + # ) + if ( + image_feature.shape[0] > 1 + and "anyres" in image_aspect_ratio + and modalities_list[image_idx] == "image" + ): + base_image_feature = image_feature[0] + image_feature = image_feature[1:] + assert height * width == base_image_feature.shape[0] + + if "anyres_max" in image_aspect_ratio: + matched_anyres_max_num_patches = re.match(r".*anyres_max_(\d+)", image_aspect_ratio) + if matched_anyres_max_num_patches: + max_num_patches = int(matched_anyres_max_num_patches.group(1)) + + if 
image_aspect_ratio == "anyres" or "anyres_max" in image_aspect_ratio: + vision_tower_image_size = self.image_size + try: + num_patch_width, num_patch_height = get_anyres_image_grid_shape( + image_sizes[image_idx][0], + self.config.image_grid_pinpoints, + vision_tower_image_size, + ) + except Exception as e: + print(f"Error: {e}") + num_patch_width, num_patch_height = 2, 2 + image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1) + else: + image_feature = image_feature.view(2, 2, height, width, -1) + + if "unpad" in self.mm_patch_merge_type: + unit = image_feature.shape[2] + image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous() + image_feature = image_feature.flatten(1, 2).flatten(2, 3) + + ### EDIT: remove `unpad_image` + # image_feature = unpad_image(image_feature, image_sizes[image_idx][0]) + + if "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches: + c, h, w = image_feature.shape + times = math.sqrt(h * w / (max_num_patches * unit**2)) + if times > 1.1: + image_feature = image_feature[None] + image_feature = nn.functional.interpolate( + image_feature, + [int(h // times), int(w // times)], + mode="bilinear", + )[0] + image_feature = torch.cat( + ( + image_feature, + self.language_model.model.image_newline[:, None, None].expand( + *image_feature.shape[:-1], 1 + ), + ), + dim=-1, + ) + image_feature = image_feature.flatten(1, 2).transpose(0, 1) + else: + image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous() + image_feature = image_feature.flatten(0, 3) + image_feature = torch.cat((base_image_feature, image_feature), dim=0) + image_feature = image_feature.unsqueeze(0) + else: + if modalities_list[image_idx] == "video": # video + # 2x2 pooling + num_of_frames = image_feature.shape[0] + image_feature = image_feature.view(num_of_frames, height, width, -1) + image_feature = image_feature.permute(0, 3, 1, 2).contiguous() # N, C, H, W + height, weight = image_feature.shape[2:] + scaled_shape = [ + 
math.ceil(height / 2), + math.ceil(weight / 2), + ] + image_feature = nn.functional.interpolate(image_feature, size=scaled_shape, mode="bilinear") + image_feature = image_feature.flatten(2).transpose(1, 2).contiguous() # N, C, H*W + if "unpad" in self.mm_patch_merge_type: + image_feature = torch.cat( + ( + image_feature, + # Expand to (bs, 1, hidden_dim) and concat at the end of the image tokens + self.language_model.model.image_newline[None, None].expand( + image_feature.shape[0], + 1, + image_feature.shape[-1], + ), + ), + dim=1, + ) + + new_image_features.append(image_feature) + image_features = new_image_features + + # Fill in the placeholder for the image + extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy() + extend_seq_lens = forward_batch.extend_seq_lens.cpu().numpy() + prefix_lens_cpu = forward_batch.extend_prefix_lens_cpu + pt = 0 + for i in range(bs): + if not need_vision[i]: + continue + + start_idx = extend_start_loc_cpu[i] + seq_len = extend_seq_lens[i] + prefix_len = prefix_lens_cpu[i] + + # Multiple images + for image_idx, image_offset in enumerate(image_inputs[i].image_offsets): + if image_offset + image_inputs[i].image_pad_len[image_idx] <= prefix_len: + continue + if image_offset >= prefix_len + seq_len: + break + + tmp_image_feature = image_features[pt][image_idx] + pad_len = tmp_image_feature.shape[0] + + input_offset = image_offset - prefix_len + left_idx = start_idx + input_offset + right_idx = left_idx + pad_len + assert right_idx > start_idx + if input_offset < 0: + left_idx = start_idx + tmp_image_feature = tmp_image_feature[-input_offset:] + if right_idx > start_idx + seq_len: + tmp_image_feature = tmp_image_feature[: start_idx + seq_len - right_idx] + right_idx = start_idx + seq_len + try: + input_embeds[left_idx:right_idx] = tmp_image_feature + except RuntimeError as e: + print(f"RuntimeError in image encoding: {e}") + print(f"{input_embeds.shape=}, {tmp_image_feature.shape=}") + print(f"{start_idx=}, {image_offset=}, 
{prefix_len=}, {pad_len=}") + pt += 1 + + return self.language_model(input_ids, positions, forward_batch, input_embeds=input_embeds) + elif forward_batch.forward_mode.is_decode(): + return self.language_model(input_ids, positions, forward_batch) + else: + raise ValueError(f"Unexpected forward mode: {forward_batch.forward_mode}") + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + projector_weights = { + "model.mm_projector": "multi_modal_mlp", + "model.vision_tower.vision_tower": "vision_tower", + # Update the vision tower weights if we find them in the checkpoint (it may be finetuned). + "model.image_newline": "language_model.model.image_newline", + } + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "projector" in name or "vision_tower" in name or "image_newline" in name: + for weight_name, param_name in projector_weights.items(): + if weight_name in name: + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + else: + self.language_model.load_weights([(name, loaded_weight)]) + + @property + def num_patches_per_side(self): + return self.image_size // self.patch_size + + +EntryClass = [Mineru2QwenForCausalLM] diff --git a/vendor/mineru/mineru/model/vlm_sglang_model/server.py b/vendor/mineru/mineru/model/vlm_sglang_model/server.py new file mode 100644 index 0000000000000000000000000000000000000000..748091c4b1bf1fb4ae3750099273adbe19a0a18e --- /dev/null +++ b/vendor/mineru/mineru/model/vlm_sglang_model/server.py @@ -0,0 +1,75 @@ +import os +import sys + +from fastapi import Request +from sglang.srt.entrypoints.http_server import app, generate_request, launch_server +from sglang.srt.managers.io_struct import GenerateReqInput +from sglang.srt.server_args import prepare_server_args +from sglang.srt.utils import kill_process_tree +from sglang.srt.conversation import Conversation 
+ +from mineru.utils.models_download_utils import auto_download_and_get_model_root_path +from .logit_processor import Mineru2LogitProcessor + +# mineru2.0的chat_template与chatml在换行上有微小区别 +def custom_get_prompt(self) -> str: + system_prompt = self.system_template.format(system_message=self.system_message) + if self.system_message == "": + ret = "" + else: + ret = system_prompt + self.sep + + for role, message in self.messages: + if message: + ret += role + "\n" + message + self.sep + else: + ret += role + "\n" + return ret + +_custom_logit_processor_str = Mineru2LogitProcessor().to_str() + +# remote the existing /generate route +for route in app.routes[:]: + if hasattr(route, "path") and getattr(route, "path") == "/generate": + app.routes.remove(route) + + +# add the custom /generate route +@app.api_route("/generate", methods=["POST", "PUT"]) +async def custom_generate_request(obj: GenerateReqInput, request: Request): + if obj.custom_logit_processor is None: + obj.custom_logit_processor = _custom_logit_processor_str + return await generate_request(obj, request) + + +def main(): + # 检查命令行参数中是否包含--model-path + args = sys.argv[1:] + has_model_path_arg = False + + for i, arg in enumerate(args): + if arg == "--model-path" or arg.startswith("--model-path="): + has_model_path_arg = True + break + + # 如果没有--model-path参数,在参数列表中添加它 + if not has_model_path_arg: + default_path = auto_download_and_get_model_root_path("/", "vlm") + args.extend(["--model-path", default_path]) + + server_args = prepare_server_args(args) + + if server_args.chat_template is None: + server_args.chat_template = "chatml" + Conversation.get_prompt = custom_get_prompt + + server_args.enable_custom_logit_processor = True + + try: + launch_server(server_args) + finally: + kill_process_tree(os.getpid(), include_parent=False) + + +if __name__ == "__main__": + main() diff --git a/vendor/mineru/mineru/resources/fasttext-langdetect/lid.176.ftz b/vendor/mineru/mineru/resources/fasttext-langdetect/lid.176.ftz new 
file mode 100644 index 0000000000000000000000000000000000000000..54ad911fadc26c1519c7043b6d596059b4116e66 --- /dev/null +++ b/vendor/mineru/mineru/resources/fasttext-langdetect/lid.176.ftz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f3472cfe8738a7b6099e8e999c3cbfae0dcd15696aac7d7738a8039db603e83 +size 938013 diff --git a/vendor/mineru/mineru/resources/header.html b/vendor/mineru/mineru/resources/header.html new file mode 100644 index 0000000000000000000000000000000000000000..f149df47cb132ca10176ccdf22bb5e2d65fda161 --- /dev/null +++ b/vendor/mineru/mineru/resources/header.html @@ -0,0 +1,129 @@ + + + + + +
+
+
+

+ MinerU 2: PDF Extraction Demo +

+
+
+ +

+ A one-stop, open-source, high-quality data extraction tool that supports converting PDF to Markdown and JSON.
+

+ + + + + +
+ + + \ No newline at end of file diff --git a/vendor/mineru/mineru/utils/__init__.py b/vendor/mineru/mineru/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e17167ceda21a510d4486bf1711c9a72bf414db --- /dev/null +++ b/vendor/mineru/mineru/utils/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Opendatalab. All rights reserved. diff --git a/vendor/mineru/mineru/utils/block_pre_proc.py b/vendor/mineru/mineru/utils/block_pre_proc.py new file mode 100644 index 0000000000000000000000000000000000000000..881979ecb27a1af46c8d2b295904b4d543685caf --- /dev/null +++ b/vendor/mineru/mineru/utils/block_pre_proc.py @@ -0,0 +1,251 @@ +# Copyright (c) Opendatalab. All rights reserved. +from mineru.utils.boxbase import ( + calculate_iou, + calculate_overlap_area_in_bbox1_area_ratio, + calculate_vertical_projection_overlap_ratio, + get_minbox_if_overlap_by_ratio +) +from mineru.utils.enum_class import BlockType + + +def process_groups(groups, body_key, caption_key, footnote_key): + body_blocks = [] + caption_blocks = [] + footnote_blocks = [] + maybe_text_image_blocks = [] + for i, group in enumerate(groups): + if body_key == 'image_body' and len(group[caption_key]) == 0 and len(group[footnote_key]) == 0: + # 如果没有caption和footnote,则不需要将group_id添加到image_body中 + group[body_key]['group_id'] = i + maybe_text_image_blocks.append(group[body_key]) + continue + else: + group[body_key]['group_id'] = i + body_blocks.append(group[body_key]) + for caption_block in group[caption_key]: + caption_block['group_id'] = i + caption_blocks.append(caption_block) + for footnote_block in group[footnote_key]: + footnote_block['group_id'] = i + footnote_blocks.append(footnote_block) + return body_blocks, caption_blocks, footnote_blocks, maybe_text_image_blocks + + +def prepare_block_bboxes( + img_body_blocks, + img_caption_blocks, + img_footnote_blocks, + table_body_blocks, + table_caption_blocks, + table_footnote_blocks, + discarded_blocks, + text_blocks, + title_blocks, + 
interline_equation_blocks, + page_w, + page_h, +): + all_bboxes = [] + + add_bboxes(img_body_blocks, BlockType.IMAGE_BODY, all_bboxes) + add_bboxes(img_caption_blocks, BlockType.IMAGE_CAPTION, all_bboxes) + add_bboxes(img_footnote_blocks, BlockType.IMAGE_CAPTION, all_bboxes) + add_bboxes(table_body_blocks, BlockType.TABLE_BODY, all_bboxes) + add_bboxes(table_caption_blocks, BlockType.TABLE_CAPTION, all_bboxes) + add_bboxes(table_footnote_blocks, BlockType.TABLE_FOOTNOTE, all_bboxes) + add_bboxes(text_blocks, BlockType.TEXT, all_bboxes) + add_bboxes(title_blocks, BlockType.TITLE, all_bboxes) + add_bboxes(interline_equation_blocks, BlockType.INTERLINE_EQUATION, all_bboxes) + + """block嵌套问题解决""" + """文本框与标题框重叠,优先信任文本框""" + all_bboxes = fix_text_overlap_title_blocks(all_bboxes) + """任何框体与舍弃框重叠,优先信任舍弃框""" + all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks) + + # interline_equation 与title或text框冲突的情况,分两种情况处理 + """interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框""" + all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes) + """interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框""" + # 通过后续大框套小框逻辑删除 + + """discarded_blocks""" + all_discarded_blocks = [] + add_bboxes(discarded_blocks, BlockType.DISCARDED, all_discarded_blocks) + + """footnote识别:宽度超过1/3页面宽度的,高度超过10的,处于页面下半30%区域的""" + footnote_blocks = [] + for discarded in discarded_blocks: + x0, y0, x1, y1 = discarded['bbox'] + if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h * 0.7): + footnote_blocks.append([x0, y0, x1, y1]) + + """移除在footnote下面的任何框""" + need_remove_blocks = find_blocks_under_footnote(all_bboxes, footnote_blocks) + if len(need_remove_blocks) > 0: + for block in need_remove_blocks: + all_bboxes.remove(block) + all_discarded_blocks.append(block) + + """经过以上处理后,还存在大框套小框的情况,则删除小框""" + all_bboxes = remove_overlaps_min_blocks(all_bboxes) + all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) + + """粗排序后返回""" + 
all_bboxes.sort(key=lambda x: x[0]+x[1]) + return all_bboxes, all_discarded_blocks, footnote_blocks + + +def add_bboxes(blocks, block_type, bboxes): + for block in blocks: + x0, y0, x1, y1 = block['bbox'] + if block_type in [ + BlockType.IMAGE_BODY, + BlockType.IMAGE_CAPTION, + BlockType.IMAGE_FOOTNOTE, + BlockType.TABLE_BODY, + BlockType.TABLE_CAPTION, + BlockType.TABLE_FOOTNOTE, + ]: + bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block['score'], block['group_id']]) + else: + bboxes.append([x0, y0, x1, y1, None, None, None, block_type, None, None, None, None, block['score']]) + + +def fix_text_overlap_title_blocks(all_bboxes): + # 先提取所有text和title block + text_blocks = [] + for block in all_bboxes: + if block[7] == BlockType.TEXT: + text_blocks.append(block) + title_blocks = [] + for block in all_bboxes: + if block[7] == BlockType.TITLE: + title_blocks.append(block) + + need_remove = [] + + for text_block in text_blocks: + for title_block in title_blocks: + text_block_bbox = text_block[:4] + title_block_bbox = title_block[:4] + if calculate_iou(text_block_bbox, title_block_bbox) > 0.8: + if title_block not in need_remove: + need_remove.append(title_block) + + if len(need_remove) > 0: + for block in need_remove: + all_bboxes.remove(block) + + return all_bboxes + + +def remove_need_drop_blocks(all_bboxes, discarded_blocks): + need_remove = [] + for block in all_bboxes: + for discarded_block in discarded_blocks: + block_bbox = block[:4] + if ( + calculate_overlap_area_in_bbox1_area_ratio( + block_bbox, discarded_block['bbox'] + ) + > 0.6 + ): + if block not in need_remove: + need_remove.append(block) + break + + if len(need_remove) > 0: + for block in need_remove: + all_bboxes.remove(block) + return all_bboxes + + +def fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes): + # 先提取所有text和interline block + text_blocks = [] + for block in all_bboxes: + if block[7] == BlockType.TEXT: + text_blocks.append(block) + 
interline_equation_blocks = [] + for block in all_bboxes: + if block[7] == BlockType.INTERLINE_EQUATION: + interline_equation_blocks.append(block) + + need_remove = [] + + for interline_equation_block in interline_equation_blocks: + for text_block in text_blocks: + interline_equation_block_bbox = interline_equation_block[:4] + text_block_bbox = text_block[:4] + if calculate_iou(interline_equation_block_bbox, text_block_bbox) > 0.8: + if text_block not in need_remove: + need_remove.append(text_block) + + if len(need_remove) > 0: + for block in need_remove: + all_bboxes.remove(block) + + return all_bboxes + + +def find_blocks_under_footnote(all_bboxes, footnote_blocks): + need_remove_blocks = [] + for block in all_bboxes: + block_x0, block_y0, block_x1, block_y1 = block[:4] + for footnote_bbox in footnote_blocks: + footnote_x0, footnote_y0, footnote_x1, footnote_y1 = footnote_bbox + # 如果footnote的纵向投影覆盖了block的纵向投影的80%且block的y0大于等于footnote的y1 + if ( + block_y0 >= footnote_y1 + and calculate_vertical_projection_overlap_ratio( + (block_x0, block_y0, block_x1, block_y1), footnote_bbox + ) + >= 0.8 + ): + if block not in need_remove_blocks: + need_remove_blocks.append(block) + break + return need_remove_blocks + + +def remove_overlaps_min_blocks(all_bboxes): + # 重叠block,小的不能直接删除,需要和大的那个合并成一个更大的。 + # 删除重叠blocks中较小的那些 + need_remove = [] + for i in range(len(all_bboxes)): + for j in range(i + 1, len(all_bboxes)): + block1 = all_bboxes[i] + block2 = all_bboxes[j] + block1_bbox = block1[:4] + block2_bbox = block2[:4] + overlap_box = get_minbox_if_overlap_by_ratio( + block1_bbox, block2_bbox, 0.8 + ) + if overlap_box is not None: + # 判断哪个区块的面积更小,移除较小的区块 + area1 = (block1[2] - block1[0]) * (block1[3] - block1[1]) + area2 = (block2[2] - block2[0]) * (block2[3] - block2[1]) + + if area1 <= area2: + block_to_remove = block1 + large_block = block2 + else: + block_to_remove = block2 + large_block = block1 + + if block_to_remove not in need_remove: + x1, y1, x2, y2 = large_block[:4] + 
sx1, sy1, sx2, sy2 = block_to_remove[:4] + x1 = min(x1, sx1) + y1 = min(y1, sy1) + x2 = max(x2, sx2) + y2 = max(y2, sy2) + large_block[:4] = [x1, y1, x2, y2] + need_remove.append(block_to_remove) + + for block in need_remove: + if block in all_bboxes: + all_bboxes.remove(block) + + return all_bboxes \ No newline at end of file diff --git a/vendor/mineru/mineru/utils/block_sort.py b/vendor/mineru/mineru/utils/block_sort.py new file mode 100644 index 0000000000000000000000000000000000000000..aef05edea696013af454fd5f1bde1a66ab686252 --- /dev/null +++ b/vendor/mineru/mineru/utils/block_sort.py @@ -0,0 +1,339 @@ +# Copyright (c) Opendatalab. All rights reserved. +import copy +import os +import statistics +import warnings +from typing import List +import torch +from loguru import logger + +from mineru.utils.config_reader import get_device +from mineru.utils.enum_class import BlockType, ModelPath +from mineru.utils.models_download_utils import auto_download_and_get_model_root_path + + +def sort_blocks_by_bbox(blocks, page_w, page_h, footnote_blocks): + + """获取所有line并计算正文line的高度""" + line_height = get_line_height(blocks) + + """获取所有line并对line排序""" + sorted_bboxes = sort_lines_by_model(blocks, page_w, page_h, line_height, footnote_blocks) + + """根据line的中位数算block的序列关系""" + blocks = cal_block_index(blocks, sorted_bboxes) + + """将image和table的block还原回group形式参与后续流程""" + blocks = revert_group_blocks(blocks) + + """重排block""" + sorted_blocks = sorted(blocks, key=lambda b: b['index']) + + """block内重排(img和table的block内多个caption或footnote的排序)""" + for block in sorted_blocks: + if block['type'] in [BlockType.IMAGE, BlockType.TABLE]: + block['blocks'] = sorted(block['blocks'], key=lambda b: b['index']) + + return sorted_blocks + + +def get_line_height(blocks): + page_line_height_list = [] + for block in blocks: + if block['type'] in [ + BlockType.TEXT, BlockType.TITLE, + BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE, + BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE + ]: + for line 
in block['lines']: + bbox = line['bbox'] + page_line_height_list.append(int(bbox[3] - bbox[1])) + if len(page_line_height_list) > 0: + return statistics.median(page_line_height_list) + else: + return 10 + + +def sort_lines_by_model(fix_blocks, page_w, page_h, line_height, footnote_blocks): + page_line_list = [] + + def add_lines_to_block(b): + line_bboxes = insert_lines_into_block(b['bbox'], line_height, page_w, page_h) + b['lines'] = [] + for line_bbox in line_bboxes: + b['lines'].append({'bbox': line_bbox, 'spans': []}) + page_line_list.extend(line_bboxes) + + for block in fix_blocks: + if block['type'] in [ + BlockType.TEXT, BlockType.TITLE, + BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE, + BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE + ]: + if len(block['lines']) == 0: + add_lines_to_block(block) + elif block['type'] in [BlockType.TITLE] and len(block['lines']) == 1 and (block['bbox'][3] - block['bbox'][1]) > line_height * 2: + block['real_lines'] = copy.deepcopy(block['lines']) + add_lines_to_block(block) + else: + for line in block['lines']: + bbox = line['bbox'] + page_line_list.append(bbox) + elif block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]: + block['real_lines'] = copy.deepcopy(block['lines']) + add_lines_to_block(block) + + for block in footnote_blocks: + footnote_block = {'bbox': block[:4]} + add_lines_to_block(footnote_block) + + if len(page_line_list) > 200: # layoutreader最高支持512line + return None + + # 使用layoutreader排序 + x_scale = 1000.0 / page_w + y_scale = 1000.0 / page_h + boxes = [] + # logger.info(f"Scale: {x_scale}, {y_scale}, Boxes len: {len(page_line_list)}") + for left, top, right, bottom in page_line_list: + if left < 0: + logger.warning( + f'left < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}' + ) # noqa: E501 + left = 0 + if right > page_w: + logger.warning( + f'right > page_w, left: {left}, right: {right}, top: {top}, bottom: 
{bottom}, page_w: {page_w}, page_h: {page_h}' + ) # noqa: E501 + right = page_w + if top < 0: + logger.warning( + f'top < 0, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}' + ) # noqa: E501 + top = 0 + if bottom > page_h: + logger.warning( + f'bottom > page_h, left: {left}, right: {right}, top: {top}, bottom: {bottom}, page_w: {page_w}, page_h: {page_h}' + ) # noqa: E501 + bottom = page_h + + left = round(left * x_scale) + top = round(top * y_scale) + right = round(right * x_scale) + bottom = round(bottom * y_scale) + assert ( + 1000 >= right >= left >= 0 and 1000 >= bottom >= top >= 0 + ), f'Invalid box. right: {right}, left: {left}, bottom: {bottom}, top: {top}' # noqa: E126, E121 + boxes.append([left, top, right, bottom]) + model_manager = ModelSingleton() + model = model_manager.get_model('layoutreader') + with torch.no_grad(): + orders = do_predict(boxes, model) + sorted_bboxes = [page_line_list[i] for i in orders] + + return sorted_bboxes + + +def insert_lines_into_block(block_bbox, line_height, page_w, page_h): + # block_bbox是一个元组(x0, y0, x1, y1),其中(x0, y0)是左下角坐标,(x1, y1)是右上角坐标 + x0, y0, x1, y1 = block_bbox + + block_height = y1 - y0 + block_weight = x1 - x0 + + # 如果block高度小于n行正文,则直接返回block的bbox + if line_height * 2 < block_height: + if ( + block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25 + ): # 可能是双列结构,可以切细点 + lines = int(block_height / line_height) + else: + # 如果block的宽度超过0.4页面宽度,则将block分成3行(是一种复杂布局,图不能切的太细) + if block_weight > page_w * 0.4: + lines = 3 + elif block_weight > page_w * 0.25: # (可能是三列结构,也切细点) + lines = int(block_height / line_height) + else: # 判断长宽比 + if block_height / block_weight > 1.2: # 细长的不分 + return [[x0, y0, x1, y1]] + else: # 不细长的还是分成两行 + lines = 2 + + line_height = (y1 - y0) / lines + + # 确定从哪个y位置开始绘制线条 + current_y = y0 + + # 用于存储线条的位置信息[(x0, y), ...] 
+ lines_positions = [] + + for i in range(lines): + lines_positions.append([x0, current_y, x1, current_y + line_height]) + current_y += line_height + return lines_positions + + else: + return [[x0, y0, x1, y1]] + + +def model_init(model_name: str): + from transformers import LayoutLMv3ForTokenClassification + device_name = get_device() + bf_16_support = False + if device_name.startswith("cuda"): + bf_16_support = torch.cuda.is_bf16_supported() + elif device_name.startswith("mps"): + bf_16_support = True + + device = torch.device(device_name) + if model_name == 'layoutreader': + # 检测modelscope的缓存目录是否存在 + layoutreader_model_dir = os.path.join(auto_download_and_get_model_root_path(ModelPath.layout_reader), ModelPath.layout_reader) + if os.path.exists(layoutreader_model_dir): + model = LayoutLMv3ForTokenClassification.from_pretrained( + layoutreader_model_dir + ) + else: + logger.warning( + 'local layoutreader model not exists, use online model from huggingface' + ) + model = LayoutLMv3ForTokenClassification.from_pretrained( + 'hantian/layoutreader' + ) + if bf_16_support: + model.to(device).eval().bfloat16() + else: + model.to(device).eval() + else: + logger.error('model name not allow') + exit(1) + return model + + +class ModelSingleton: + _instance = None + _models = {} + + def __new__(cls, *args, **kwargs): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def get_model(self, model_name: str): + if model_name not in self._models: + self._models[model_name] = model_init(model_name=model_name) + return self._models[model_name] + + +def do_predict(boxes: List[List[int]], model) -> List[int]: + from mineru.model.reading_order.layout_reader import ( + boxes2inputs, parse_logits, prepare_inputs) + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=FutureWarning, module="transformers") + + inputs = boxes2inputs(boxes) + inputs = prepare_inputs(inputs, model) + logits = 
model(**inputs).logits.cpu().squeeze(0) + return parse_logits(logits, len(boxes)) + + +def cal_block_index(fix_blocks, sorted_bboxes): + + if sorted_bboxes is not None: + # 使用layoutreader排序 + for block in fix_blocks: + line_index_list = [] + if len(block['lines']) == 0: + block['index'] = sorted_bboxes.index(block['bbox']) + else: + for line in block['lines']: + line['index'] = sorted_bboxes.index(line['bbox']) + line_index_list.append(line['index']) + median_value = statistics.median(line_index_list) + block['index'] = median_value + + # 删除图表body block中的虚拟line信息, 并用real_lines信息回填 + if block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.TITLE, BlockType.INTERLINE_EQUATION]: + if 'real_lines' in block: + block['virtual_lines'] = copy.deepcopy(block['lines']) + block['lines'] = copy.deepcopy(block['real_lines']) + del block['real_lines'] + else: + # 使用xycut排序 + block_bboxes = [] + for block in fix_blocks: + # 如果block['bbox']任意值小于0,将其置为0 + block['bbox'] = [max(0, x) for x in block['bbox']] + block_bboxes.append(block['bbox']) + + # 删除图表body block中的虚拟line信息, 并用real_lines信息回填 + if block['type'] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.TITLE, BlockType.INTERLINE_EQUATION]: + if 'real_lines' in block: + block['virtual_lines'] = copy.deepcopy(block['lines']) + block['lines'] = copy.deepcopy(block['real_lines']) + del block['real_lines'] + + import numpy as np + from mineru.model.reading_order.xycut import recursive_xy_cut + + random_boxes = np.array(block_bboxes) + np.random.shuffle(random_boxes) + res = [] + recursive_xy_cut(np.asarray(random_boxes).astype(int), np.arange(len(block_bboxes)), res) + assert len(res) == len(block_bboxes) + sorted_boxes = random_boxes[np.array(res)].tolist() + + for i, block in enumerate(fix_blocks): + block['index'] = sorted_boxes.index(block['bbox']) + + # 生成line index + sorted_blocks = sorted(fix_blocks, key=lambda b: b['index']) + line_inedx = 1 + for block in sorted_blocks: + for line in 
block['lines']: + line['index'] = line_inedx + line_inedx += 1 + + return fix_blocks + + +def revert_group_blocks(blocks): + image_groups = {} + table_groups = {} + new_blocks = [] + for block in blocks: + if block['type'] in [BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE]: + group_id = block['group_id'] + if group_id not in image_groups: + image_groups[group_id] = [] + image_groups[group_id].append(block) + elif block['type'] in [BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE]: + group_id = block['group_id'] + if group_id not in table_groups: + table_groups[group_id] = [] + table_groups[group_id].append(block) + else: + new_blocks.append(block) + + for group_id, blocks in image_groups.items(): + new_blocks.append(process_block_list(blocks, BlockType.IMAGE_BODY, BlockType.IMAGE)) + + for group_id, blocks in table_groups.items(): + new_blocks.append(process_block_list(blocks, BlockType.TABLE_BODY, BlockType.TABLE)) + + return new_blocks + + +def process_block_list(blocks, body_type, block_type): + indices = [block['index'] for block in blocks] + median_index = statistics.median(indices) + + body_bbox = next((block['bbox'] for block in blocks if block.get('type') == body_type), []) + + return { + 'type': block_type, + 'bbox': body_bbox, + 'blocks': blocks, + 'index': median_index, + } \ No newline at end of file diff --git a/vendor/mineru/mineru/utils/boxbase.py b/vendor/mineru/mineru/utils/boxbase.py new file mode 100644 index 0000000000000000000000000000000000000000..e249ecadd1d3c7b92c8bf13e946211e70abc921d --- /dev/null +++ b/vendor/mineru/mineru/utils/boxbase.py @@ -0,0 +1,203 @@ +import math + + +def is_in(box1, box2) -> bool: + """box1是否完全在box2里面.""" + x0_1, y0_1, x1_1, y1_1 = box1 + x0_2, y0_2, x1_2, y1_2 = box2 + + return ( + x0_1 >= x0_2 # box1的左边界不在box2的左边外 + and y0_1 >= y0_2 # box1的上边界不在box2的上边外 + and x1_1 <= x1_2 # box1的右边界不在box2的右边外 + and y1_1 <= y1_2 + ) # box1的下边界不在box2的下边外 + + +def 
bbox_relative_pos(bbox1, bbox2): + """判断两个矩形框的相对位置关系. + + Args: + bbox1: 一个四元组,表示第一个矩形框的左上角和右下角的坐标,格式为(x1, y1, x1b, y1b) + bbox2: 一个四元组,表示第二个矩形框的左上角和右下角的坐标,格式为(x2, y2, x2b, y2b) + + Returns: + 一个四元组,表示矩形框1相对于矩形框2的位置关系,格式为(left, right, bottom, top) + 其中,left表示矩形框1是否在矩形框2的左侧,right表示矩形框1是否在矩形框2的右侧, + bottom表示矩形框1是否在矩形框2的下方,top表示矩形框1是否在矩形框2的上方 + """ + x1, y1, x1b, y1b = bbox1 + x2, y2, x2b, y2b = bbox2 + + left = x2b < x1 + right = x1b < x2 + bottom = y2b < y1 + top = y1b < y2 + return left, right, bottom, top + + +def bbox_distance(bbox1, bbox2): + """计算两个矩形框的距离。 + + Args: + bbox1 (tuple): 第一个矩形框的坐标,格式为 (x1, y1, x2, y2),其中 (x1, y1) 为左上角坐标,(x2, y2) 为右下角坐标。 + bbox2 (tuple): 第二个矩形框的坐标,格式为 (x1, y1, x2, y2),其中 (x1, y1) 为左上角坐标,(x2, y2) 为右下角坐标。 + + Returns: + float: 矩形框之间的距离。 + """ + + def dist(point1, point2): + return math.sqrt((point1[0] - point2[0]) ** 2 + (point1[1] - point2[1]) ** 2) + + x1, y1, x1b, y1b = bbox1 + x2, y2, x2b, y2b = bbox2 + + left, right, bottom, top = bbox_relative_pos(bbox1, bbox2) + + if top and left: + return dist((x1, y1b), (x2b, y2)) + elif left and bottom: + return dist((x1, y1), (x2b, y2b)) + elif bottom and right: + return dist((x1b, y1), (x2, y2b)) + elif right and top: + return dist((x1b, y1b), (x2, y2)) + elif left: + return x1 - x2b + elif right: + return x2 - x1b + elif bottom: + return y1 - y2b + elif top: + return y2 - y1b + return 0.0 + + +def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio): + """通过calculate_overlap_area_2_minbox_area_ratio计算两个bbox重叠的面积占最小面积的box的比例 + 如果比例大于ratio,则返回小的那个bbox, 否则返回None.""" + x1_min, y1_min, x1_max, y1_max = bbox1 + x2_min, y2_min, x2_max, y2_max = bbox2 + area1 = (x1_max - x1_min) * (y1_max - y1_min) + area2 = (x2_max - x2_min) * (y2_max - y2_min) + overlap_ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2) + if overlap_ratio > ratio: + if area1 <= area2: + return bbox1 + else: + return bbox2 + else: + return None + + +def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2): + 
"""计算box1和box2的重叠面积占最小面积的box的比例.""" + # Determine the coordinates of the intersection rectangle + x_left = max(bbox1[0], bbox2[0]) + y_top = max(bbox1[1], bbox2[1]) + x_right = min(bbox1[2], bbox2[2]) + y_bottom = min(bbox1[3], bbox2[3]) + + if x_right < x_left or y_bottom < y_top: + return 0.0 + + # The area of overlap area + intersection_area = (x_right - x_left) * (y_bottom - y_top) + min_box_area = min([(bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]), + (bbox2[3] - bbox2[1]) * (bbox2[2] - bbox2[0])]) + if min_box_area == 0: + return 0 + else: + return intersection_area / min_box_area + + +def calculate_iou(bbox1, bbox2): + """计算两个边界框的交并比(IOU)。 + + Args: + bbox1 (list[float]): 第一个边界框的坐标,格式为 [x1, y1, x2, y2],其中 (x1, y1) 为左上角坐标,(x2, y2) 为右下角坐标。 + bbox2 (list[float]): 第二个边界框的坐标,格式与 `bbox1` 相同。 + + Returns: + float: 两个边界框的交并比(IOU),取值范围为 [0, 1]。 + """ + # Determine the coordinates of the intersection rectangle + x_left = max(bbox1[0], bbox2[0]) + y_top = max(bbox1[1], bbox2[1]) + x_right = min(bbox1[2], bbox2[2]) + y_bottom = min(bbox1[3], bbox2[3]) + + if x_right < x_left or y_bottom < y_top: + return 0.0 + + # The area of overlap area + intersection_area = (x_right - x_left) * (y_bottom - y_top) + + # The area of both rectangles + bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) + bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) + + if any([bbox1_area == 0, bbox2_area == 0]): + return 0 + + # Compute the intersection over union by taking the intersection area + # and dividing it by the sum of both areas minus the intersection area + iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area) + + return iou + + +def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2): + """计算box1和box2的重叠面积占bbox1的比例.""" + # Determine the coordinates of the intersection rectangle + x_left = max(bbox1[0], bbox2[0]) + y_top = max(bbox1[1], bbox2[1]) + x_right = min(bbox1[2], bbox2[2]) + y_bottom = min(bbox1[3], bbox2[3]) + + if x_right < x_left or 
y_bottom < y_top: + return 0.0 + + # The area of overlap area + intersection_area = (x_right - x_left) * (y_bottom - y_top) + bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) + if bbox1_area == 0: + return 0 + else: + return intersection_area / bbox1_area + + +def calculate_vertical_projection_overlap_ratio(block1, block2): + """ + Calculate the proportion of the x-axis covered by the vertical projection of two blocks. + + Args: + block1 (tuple): Coordinates of the first block (x0, y0, x1, y1). + block2 (tuple): Coordinates of the second block (x0, y0, x1, y1). + + Returns: + float: The proportion of the x-axis covered by the vertical projection of the two blocks. + """ + x0_1, _, x1_1, _ = block1 + x0_2, _, x1_2, _ = block2 + + # Calculate the intersection of the x-coordinates + x_left = max(x0_1, x0_2) + x_right = min(x1_1, x1_2) + + if x_right < x_left: + return 0.0 + + # Length of the intersection + intersection_length = x_right - x_left + + # Length of the x-axis projection of the first block + block1_length = x1_1 - x0_1 + + if block1_length == 0: + return 0.0 + + # Proportion of the x-axis covered by the intersection + # logger.info(f"intersection_length: {intersection_length}, block1_length: {block1_length}") + return intersection_length / block1_length \ No newline at end of file diff --git a/vendor/mineru/mineru/utils/cli_parser.py b/vendor/mineru/mineru/utils/cli_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..d1b981663594214e61f5489c094c3904d95db5b6 --- /dev/null +++ b/vendor/mineru/mineru/utils/cli_parser.py @@ -0,0 +1,38 @@ +import click + + +def arg_parse(ctx: 'click.Context') -> dict: + # 解析额外参数 + extra_kwargs = {} + i = 0 + while i < len(ctx.args): + arg = ctx.args[i] + if arg.startswith('--'): + param_name = arg[2:].replace('-', '_') # 转换参数名格式 + i += 1 + if i < len(ctx.args) and not ctx.args[i].startswith('--'): + # 参数有值 + try: + # 尝试转换为适当的类型 + if ctx.args[i].lower() == 'true': + extra_kwargs[param_name] = 
True + elif ctx.args[i].lower() == 'false': + extra_kwargs[param_name] = False + elif '.' in ctx.args[i]: + try: + extra_kwargs[param_name] = float(ctx.args[i]) + except ValueError: + extra_kwargs[param_name] = ctx.args[i] + else: + try: + extra_kwargs[param_name] = int(ctx.args[i]) + except ValueError: + extra_kwargs[param_name] = ctx.args[i] + except: + extra_kwargs[param_name] = ctx.args[i] + else: + # 布尔型标志参数 + extra_kwargs[param_name] = True + i -= 1 + i += 1 + return extra_kwargs \ No newline at end of file diff --git a/vendor/mineru/mineru/utils/config_reader.py b/vendor/mineru/mineru/utils/config_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..f6d013eab30a3ab1b1268ad95b2432552f4bcb60 --- /dev/null +++ b/vendor/mineru/mineru/utils/config_reader.py @@ -0,0 +1,136 @@ +# Copyright (c) Opendatalab. All rights reserved. +import json +import os +from loguru import logger + +try: + import torch + import torch_npu +except ImportError: + pass + + +# 定义配置文件名常量 +CONFIG_FILE_NAME = os.getenv('MINERU_TOOLS_CONFIG_JSON', 'mineru.json') + + +def read_config(): + if os.path.isabs(CONFIG_FILE_NAME): + config_file = CONFIG_FILE_NAME + else: + home_dir = os.path.expanduser('~') + config_file = os.path.join(home_dir, CONFIG_FILE_NAME) + + if not os.path.exists(config_file): + # logger.warning(f'{config_file} not found, using default configuration') + return None + else: + with open(config_file, 'r', encoding='utf-8') as f: + config = json.load(f) + return config + + +def get_s3_config(bucket_name: str): + """~/magic-pdf.json 读出来.""" + config = read_config() + + bucket_info = config.get('bucket_info') + if bucket_name not in bucket_info: + access_key, secret_key, storage_endpoint = bucket_info['[default]'] + else: + access_key, secret_key, storage_endpoint = bucket_info[bucket_name] + + if access_key is None or secret_key is None or storage_endpoint is None: + raise Exception(f'ak, sk or endpoint not found in {CONFIG_FILE_NAME}') + + # 
logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}") + + return access_key, secret_key, storage_endpoint + + +def get_s3_config_dict(path: str): + access_key, secret_key, storage_endpoint = get_s3_config(get_bucket_name(path)) + return {'ak': access_key, 'sk': secret_key, 'endpoint': storage_endpoint} + + +def get_bucket_name(path): + bucket, key = parse_bucket_key(path) + return bucket + + +def parse_bucket_key(s3_full_path: str): + """ + 输入 s3://bucket/path/to/my/file.txt + 输出 bucket, path/to/my/file.txt + """ + s3_full_path = s3_full_path.strip() + if s3_full_path.startswith("s3://"): + s3_full_path = s3_full_path[5:] + if s3_full_path.startswith("/"): + s3_full_path = s3_full_path[1:] + bucket, key = s3_full_path.split("/", 1) + return bucket, key + + +def get_device(): + device_mode = os.getenv('MINERU_DEVICE_MODE', None) + if device_mode is not None: + return device_mode + else: + if torch.cuda.is_available(): + return "cuda" + elif torch.backends.mps.is_available(): + return "mps" + else: + try: + if torch_npu.npu.is_available(): + return "npu" + except Exception as e: + pass + return "cpu" + + +def get_formula_enable(formula_enable): + formula_enable_env = os.getenv('MINERU_FORMULA_ENABLE') + formula_enable = formula_enable if formula_enable_env is None else formula_enable_env.lower() == 'true' + return formula_enable + + +def get_table_enable(table_enable): + table_enable_env = os.getenv('MINERU_TABLE_ENABLE') + table_enable = table_enable if table_enable_env is None else table_enable_env.lower() == 'true' + return table_enable + + +def get_latex_delimiter_config(): + config = read_config() + if config is None: + return None + latex_delimiter_config = config.get('latex-delimiter-config', None) + if latex_delimiter_config is None: + # logger.warning(f"'latex-delimiter-config' not found in {CONFIG_FILE_NAME}, use 'None' as default") + return None + else: + return latex_delimiter_config + + +def get_llm_aided_config(): 
+ config = read_config() + if config is None: + return None + llm_aided_config = config.get('llm-aided-config', None) + if llm_aided_config is None: + # logger.warning(f"'llm-aided-config' not found in {CONFIG_FILE_NAME}, use 'None' as default") + return None + else: + return llm_aided_config + + +def get_local_models_dir(): + config = read_config() + if config is None: + return None + models_dir = config.get('models-dir') + if models_dir is None: + logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use None as default") + return models_dir \ No newline at end of file diff --git a/vendor/mineru/mineru/utils/cut_image.py b/vendor/mineru/mineru/utils/cut_image.py new file mode 100644 index 0000000000000000000000000000000000000000..20e02aeed1157bb2b5c44604d5c7497da54dad62 --- /dev/null +++ b/vendor/mineru/mineru/utils/cut_image.py @@ -0,0 +1,27 @@ +from loguru import logger + +from .pdf_image_tools import cut_image + + +def cut_image_and_table(span, page_pil_img, page_img_md5, page_id, image_writer, scale=2): + + def return_path(path_type): + return f"{path_type}/{page_img_md5}" + + span_type = span["type"] + + if not check_img_bbox(span["bbox"]) or not image_writer: + span["image_path"] = "" + else: + span["image_path"] = cut_image( + span["bbox"], page_id, page_pil_img, return_path=return_path(span_type), image_writer=image_writer, scale=scale + ) + + return span + + +def check_img_bbox(bbox) -> bool: + if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]): + logger.warning(f"image_bboxes: 错误的box, {bbox}") + return False + return True diff --git a/vendor/mineru/mineru/utils/draw_bbox.py b/vendor/mineru/mineru/utils/draw_bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..2109bea0f22f3eca53ef2b2afb44977f4d552c94 --- /dev/null +++ b/vendor/mineru/mineru/utils/draw_bbox.py @@ -0,0 +1,329 @@ +import json +from io import BytesIO + +from loguru import logger +from pypdf import PdfReader, PdfWriter +from reportlab.pdfgen import canvas + 
+from .enum_class import BlockType, ContentType + + +def draw_bbox_without_number(i, bbox_list, page, c, rgb_config, fill_config): + new_rgb = [float(color) / 255 for color in rgb_config] + page_data = bbox_list[i] + page_width, page_height = page.cropbox[2], page.cropbox[3] + + for bbox in page_data: + width = bbox[2] - bbox[0] + height = bbox[3] - bbox[1] + rect = [bbox[0], page_height - bbox[3], width, height] # Define the rectangle + + if fill_config: # filled rectangle + c.setFillColorRGB(new_rgb[0], new_rgb[1], new_rgb[2], 0.3) + c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1) + else: # bounding box + c.setStrokeColorRGB(new_rgb[0], new_rgb[1], new_rgb[2]) + c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0) + return c + + +def draw_bbox_with_number(i, bbox_list, page, c, rgb_config, fill_config, draw_bbox=True): + new_rgb = [float(color) / 255 for color in rgb_config] + page_data = bbox_list[i] + # 强制转换为 float + page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3]) + + for j, bbox in enumerate(page_data): + # 确保bbox的每个元素都是float + x0, y0, x1, y1 = map(float, bbox) + width = x1 - x0 + height = y1 - y0 + rect = [x0, page_height - y1, width, height] + if draw_bbox: + if fill_config: + c.setFillColorRGB(*new_rgb, 0.3) + c.rect(rect[0], rect[1], rect[2], rect[3], stroke=0, fill=1) + else: + c.setStrokeColorRGB(*new_rgb) + c.rect(rect[0], rect[1], rect[2], rect[3], stroke=1, fill=0) + c.setFillColorRGB(*new_rgb, 1.0) + c.setFontSize(size=10) + # 这里也要用float + c.drawString(x1 + 2, page_height - y0 - 10, str(j + 1)) + + return c + + +def draw_layout_bbox(pdf_info, pdf_bytes, out_path, filename): + dropped_bbox_list = [] + tables_list, tables_body_list = [], [] + tables_caption_list, tables_footnote_list = [], [] + imgs_list, imgs_body_list, imgs_caption_list = [], [], [] + imgs_footnote_list = [] + titles_list = [] + texts_list = [] + interequations_list = [] + lists_list = [] + indexs_list = [] + for page in pdf_info: + 
page_dropped_list = [] + tables, tables_body, tables_caption, tables_footnote = [], [], [], [] + imgs, imgs_body, imgs_caption, imgs_footnote = [], [], [], [] + titles = [] + texts = [] + interequations = [] + lists = [] + indices = [] + + for dropped_bbox in page['discarded_blocks']: + page_dropped_list.append(dropped_bbox['bbox']) + dropped_bbox_list.append(page_dropped_list) + for block in page["para_blocks"]: + bbox = block["bbox"] + if block["type"] == BlockType.TABLE: + tables.append(bbox) + for nested_block in block["blocks"]: + bbox = nested_block["bbox"] + if nested_block["type"] == BlockType.TABLE_BODY: + tables_body.append(bbox) + elif nested_block["type"] == BlockType.TABLE_CAPTION: + tables_caption.append(bbox) + elif nested_block["type"] == BlockType.TABLE_FOOTNOTE: + tables_footnote.append(bbox) + elif block["type"] == BlockType.IMAGE: + imgs.append(bbox) + for nested_block in block["blocks"]: + bbox = nested_block["bbox"] + if nested_block["type"] == BlockType.IMAGE_BODY: + imgs_body.append(bbox) + elif nested_block["type"] == BlockType.IMAGE_CAPTION: + imgs_caption.append(bbox) + elif nested_block["type"] == BlockType.IMAGE_FOOTNOTE: + imgs_footnote.append(bbox) + elif block["type"] == BlockType.TITLE: + titles.append(bbox) + elif block["type"] == BlockType.TEXT: + texts.append(bbox) + elif block["type"] == BlockType.INTERLINE_EQUATION: + interequations.append(bbox) + elif block["type"] == BlockType.LIST: + lists.append(bbox) + elif block["type"] == BlockType.INDEX: + indices.append(bbox) + + tables_list.append(tables) + tables_body_list.append(tables_body) + tables_caption_list.append(tables_caption) + tables_footnote_list.append(tables_footnote) + imgs_list.append(imgs) + imgs_body_list.append(imgs_body) + imgs_caption_list.append(imgs_caption) + imgs_footnote_list.append(imgs_footnote) + titles_list.append(titles) + texts_list.append(texts) + interequations_list.append(interequations) + lists_list.append(lists) + indexs_list.append(indices) + + 
layout_bbox_list = [] + + table_type_order = {"table_caption": 1, "table_body": 2, "table_footnote": 3} + for page in pdf_info: + page_block_list = [] + for block in page["para_blocks"]: + if block["type"] in [ + BlockType.TEXT, + BlockType.TITLE, + BlockType.INTERLINE_EQUATION, + BlockType.LIST, + BlockType.INDEX, + ]: + bbox = block["bbox"] + page_block_list.append(bbox) + elif block["type"] in [BlockType.IMAGE]: + for sub_block in block["blocks"]: + bbox = sub_block["bbox"] + page_block_list.append(bbox) + elif block["type"] in [BlockType.TABLE]: + sorted_blocks = sorted(block["blocks"], key=lambda x: table_type_order[x["type"]]) + for sub_block in sorted_blocks: + bbox = sub_block["bbox"] + page_block_list.append(bbox) + + layout_bbox_list.append(page_block_list) + + pdf_bytes_io = BytesIO(pdf_bytes) + pdf_docs = PdfReader(pdf_bytes_io) + output_pdf = PdfWriter() + + for i, page in enumerate(pdf_docs.pages): + # 获取原始页面尺寸 + page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3]) + custom_page_size = (page_width, page_height) + + packet = BytesIO() + # 使用原始PDF的尺寸创建canvas + c = canvas.Canvas(packet, pagesize=custom_page_size) + + c = draw_bbox_without_number(i, dropped_bbox_list, page, c, [158, 158, 158], True) + c = draw_bbox_without_number(i, tables_body_list, page, c, [204, 204, 0], True) + c = draw_bbox_without_number(i, tables_caption_list, page, c, [255, 255, 102], True) + c = draw_bbox_without_number(i, tables_footnote_list, page, c, [229, 255, 204], True) + c = draw_bbox_without_number(i, imgs_body_list, page, c, [153, 255, 51], True) + c = draw_bbox_without_number(i, imgs_caption_list, page, c, [102, 178, 255], True) + c = draw_bbox_without_number(i, imgs_footnote_list, page, c, [255, 178, 102], True) + c = draw_bbox_without_number(i, titles_list, page, c, [102, 102, 255], True) + c = draw_bbox_without_number(i, texts_list, page, c, [153, 0, 76], True) + c = draw_bbox_without_number(i, interequations_list, page, c, [0, 255, 0], True) + c 
= draw_bbox_without_number(i, lists_list, page, c, [40, 169, 92], True) + c = draw_bbox_without_number(i, indexs_list, page, c, [40, 169, 92], True) + c = draw_bbox_with_number(i, layout_bbox_list, page, c, [255, 0, 0], False, draw_bbox=False) + + c.save() + packet.seek(0) + overlay_pdf = PdfReader(packet) + + # 添加检查确保overlay_pdf.pages不为空 + if len(overlay_pdf.pages) > 0: + page.merge_page(overlay_pdf.pages[0]) + else: + # 记录日志并继续处理下一个页面 + # logger.warning(f"layout.pdf: 第{i + 1}页未能生成有效的overlay PDF") + pass + + output_pdf.add_page(page) + + # 保存结果 + with open(f"{out_path}/{filename}", "wb") as f: + output_pdf.write(f) + + +def draw_span_bbox(pdf_info, pdf_bytes, out_path, filename): + text_list = [] + inline_equation_list = [] + interline_equation_list = [] + image_list = [] + table_list = [] + dropped_list = [] + next_page_text_list = [] + next_page_inline_equation_list = [] + + def get_span_info(span): + if span['type'] == ContentType.TEXT: + if span.get('cross_page', False): + next_page_text_list.append(span['bbox']) + else: + page_text_list.append(span['bbox']) + elif span['type'] == ContentType.INLINE_EQUATION: + if span.get('cross_page', False): + next_page_inline_equation_list.append(span['bbox']) + else: + page_inline_equation_list.append(span['bbox']) + elif span['type'] == ContentType.INTERLINE_EQUATION: + page_interline_equation_list.append(span['bbox']) + elif span['type'] == ContentType.IMAGE: + page_image_list.append(span['bbox']) + elif span['type'] == ContentType.TABLE: + page_table_list.append(span['bbox']) + + for page in pdf_info: + page_text_list = [] + page_inline_equation_list = [] + page_interline_equation_list = [] + page_image_list = [] + page_table_list = [] + page_dropped_list = [] + + # 将跨页的span放到移动到下一页的列表中 + if len(next_page_text_list) > 0: + page_text_list.extend(next_page_text_list) + next_page_text_list.clear() + if len(next_page_inline_equation_list) > 0: + page_inline_equation_list.extend(next_page_inline_equation_list) + 
next_page_inline_equation_list.clear() + + # 构造dropped_list + for block in page['discarded_blocks']: + if block['type'] == BlockType.DISCARDED: + for line in block['lines']: + for span in line['spans']: + page_dropped_list.append(span['bbox']) + dropped_list.append(page_dropped_list) + # 构造其余useful_list + # for block in page['para_blocks']: # span直接用分段合并前的结果就可以 + for block in page['preproc_blocks']: + if block['type'] in [ + BlockType.TEXT, + BlockType.TITLE, + BlockType.INTERLINE_EQUATION, + BlockType.LIST, + BlockType.INDEX, + ]: + for line in block['lines']: + for span in line['spans']: + get_span_info(span) + elif block['type'] in [BlockType.IMAGE, BlockType.TABLE]: + for sub_block in block['blocks']: + for line in sub_block['lines']: + for span in line['spans']: + get_span_info(span) + text_list.append(page_text_list) + inline_equation_list.append(page_inline_equation_list) + interline_equation_list.append(page_interline_equation_list) + image_list.append(page_image_list) + table_list.append(page_table_list) + + pdf_bytes_io = BytesIO(pdf_bytes) + pdf_docs = PdfReader(pdf_bytes_io) + output_pdf = PdfWriter() + + for i, page in enumerate(pdf_docs.pages): + # 获取原始页面尺寸 + page_width, page_height = float(page.cropbox[2]), float(page.cropbox[3]) + custom_page_size = (page_width, page_height) + + packet = BytesIO() + # 使用原始PDF的尺寸创建canvas + c = canvas.Canvas(packet, pagesize=custom_page_size) + + # 获取当前页面的数据 + draw_bbox_without_number(i, text_list, page, c,[255, 0, 0], False) + draw_bbox_without_number(i, inline_equation_list, page, c, [0, 255, 0], False) + draw_bbox_without_number(i, interline_equation_list, page, c, [0, 0, 255], False) + draw_bbox_without_number(i, image_list, page, c, [255, 204, 0], False) + draw_bbox_without_number(i, table_list, page, c, [204, 0, 255], False) + draw_bbox_without_number(i, dropped_list, page, c, [158, 158, 158], False) + + c.save() + packet.seek(0) + overlay_pdf = PdfReader(packet) + + # 添加检查确保overlay_pdf.pages不为空 + if 
len(overlay_pdf.pages) > 0: + page.merge_page(overlay_pdf.pages[0]) + else: + # 记录日志并继续处理下一个页面 + # logger.warning(f"span.pdf: 第{i + 1}页未能生成有效的overlay PDF") + pass + + output_pdf.add_page(page) + + # Save the PDF + with open(f"{out_path}/{filename}", "wb") as f: + output_pdf.write(f) + + +if __name__ == "__main__": + # 读取PDF文件 + pdf_path = "examples/demo1.pdf" + with open(pdf_path, "rb") as f: + pdf_bytes = f.read() + + # 从json文件读取pdf_info + + json_path = "examples/demo1_1746005777.0863056_middle.json" + with open(json_path, "r", encoding="utf-8") as f: + pdf_ann = json.load(f) + pdf_info = pdf_ann["pdf_info"] + # 调用可视化函数,输出到examples目录 + draw_layout_bbox(pdf_info, pdf_bytes, "examples", "output_with_layout.pdf") diff --git a/vendor/mineru/mineru/utils/enum_class.py b/vendor/mineru/mineru/utils/enum_class.py new file mode 100644 index 0000000000000000000000000000000000000000..49164e329ce735e70146fac66dd2dd2e3fd0db48 --- /dev/null +++ b/vendor/mineru/mineru/utils/enum_class.py @@ -0,0 +1,66 @@ +class BlockType: + IMAGE = 'image' + TABLE = 'table' + IMAGE_BODY = 'image_body' + TABLE_BODY = 'table_body' + IMAGE_CAPTION = 'image_caption' + TABLE_CAPTION = 'table_caption' + IMAGE_FOOTNOTE = 'image_footnote' + TABLE_FOOTNOTE = 'table_footnote' + TEXT = 'text' + TITLE = 'title' + INTERLINE_EQUATION = 'interline_equation' + LIST = 'list' + INDEX = 'index' + DISCARDED = 'discarded' + + +class ContentType: + IMAGE = 'image' + TABLE = 'table' + TEXT = 'text' + INTERLINE_EQUATION = 'interline_equation' + INLINE_EQUATION = 'inline_equation' + EQUATION = 'equation' + + +class CategoryId: + Title = 0 + Text = 1 + Abandon = 2 + ImageBody = 3 + ImageCaption = 4 + TableBody = 5 + TableCaption = 6 + TableFootnote = 7 + InterlineEquation_Layout = 8 + InterlineEquationNumber_Layout = 9 + InlineEquation = 13 + InterlineEquation_YOLO = 14 + OcrText = 15 + LowScoreText = 16 + ImageFootnote = 101 + + +class MakeMode: + MM_MD = 'mm_markdown' + NLP_MD = 'nlp_markdown' + CONTENT_LIST = 
'content_list' + + +class ModelPath: + vlm_root_hf = "opendatalab/MinerU2.0-2505-0.9B" + vlm_root_modelscope = "OpenDataLab/MinerU2.0-2505-0.9B" + pipeline_root_modelscope = "OpenDataLab/PDF-Extract-Kit-1.0" + pipeline_root_hf = "opendatalab/PDF-Extract-Kit-1.0" + doclayout_yolo = "models/Layout/YOLO/doclayout_yolo_docstructbench_imgsz1280_2501.pt" + yolo_v8_mfd = "models/MFD/YOLO/yolo_v8_ft.pt" + unimernet_small = "models/MFR/unimernet_hf_small_2503" + pytorch_paddle = "models/OCR/paddleocr_torch" + layout_reader = "models/ReadingOrder/layout_reader" + slanet_plus = "models/TabRec/SlanetPlus/slanet-plus.onnx" + + +class SplitFlag: + CROSS_PAGE = 'cross_page' + LINES_DELETED = 'lines_deleted' \ No newline at end of file diff --git a/vendor/mineru/mineru/utils/format_utils.py b/vendor/mineru/mineru/utils/format_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7462f64946f8bb8734b34bc447ec2fc602bf0816 --- /dev/null +++ b/vendor/mineru/mineru/utils/format_utils.py @@ -0,0 +1,319 @@ + +import re +import itertools +import html +from typing import Any, Dict, List +from pydantic import ( + BaseModel, + computed_field, + model_validator, +) + +class TableCell(BaseModel): + """TableCell.""" + row_span: int = 1 + col_span: int = 1 + start_row_offset_idx: int + end_row_offset_idx: int + start_col_offset_idx: int + end_col_offset_idx: int + text: str + column_header: bool = False + row_header: bool = False + row_section: bool = False + + @model_validator(mode="before") + @classmethod + def from_dict_format(cls, data: Any) -> Any: + """from_dict_format.""" + if isinstance(data, Dict): + # Check if this is a native BoundingBox or a bbox from docling-ibm-models + if ( + # "bbox" not in data + # or data["bbox"] is None + # or isinstance(data["bbox"], BoundingBox) + "text" + in data + ): + return data + text = data["bbox"].get("token", "") + if not len(text): + text_cells = data.pop("text_cell_bboxes", None) + if text_cells: + for el in text_cells: + 
text += el["token"] + " " + + text = text.strip() + data["text"] = text + + return data + + +class TableData(BaseModel): # TBD + """BaseTableData.""" + + table_cells: List[TableCell] = [] + num_rows: int = 0 + num_cols: int = 0 + + @computed_field # type: ignore + @property + def grid( + self, + ) -> List[List[TableCell]]: + """grid.""" + # Initialise empty table data grid (only empty cells) + table_data = [ + [ + TableCell( + text="", + start_row_offset_idx=i, + end_row_offset_idx=i + 1, + start_col_offset_idx=j, + end_col_offset_idx=j + 1, + ) + for j in range(self.num_cols) + ] + for i in range(self.num_rows) + ] + + # Overwrite cells in table data for which there is actual cell content. + for cell in self.table_cells: + for i in range( + min(cell.start_row_offset_idx, self.num_rows), + min(cell.end_row_offset_idx, self.num_rows), + ): + for j in range( + min(cell.start_col_offset_idx, self.num_cols), + min(cell.end_col_offset_idx, self.num_cols), + ): + table_data[i][j] = cell + + return table_data + + +""" +OTSL +""" +OTSL_NL = "" +OTSL_FCEL = "" +OTSL_ECEL = "" +OTSL_LCEL = "" +OTSL_UCEL = "" +OTSL_XCEL = "" + + +def otsl_extract_tokens_and_text(s: str): + # Pattern to match anything enclosed by < > + # (including the angle brackets themselves) + # pattern = r"(<[^>]+>)" + pattern = r"(" + r"|".join([OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]) + r")" + # Find all tokens (e.g. "", "", etc.) 
+ tokens = re.findall(pattern, s) + # Remove any tokens that start with " to tags + for row_idx, row in enumerate(split_row_tokens): + while len(row) < max_cols: + row.append(OTSL_ECEL) + + # Insert additional to texts + new_texts = [] + text_idx = 0 + + for row_idx, row in enumerate(split_row_tokens): + for col_idx, token in enumerate(row): + new_texts.append(token) + if text_idx < len(texts) and texts[text_idx] == token: + text_idx += 1 + if (text_idx < len(texts) and + texts[text_idx] not in [OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]): + new_texts.append(texts[text_idx]) + text_idx += 1 + + new_texts.append(OTSL_NL) + if text_idx < len(texts) and texts[text_idx] == OTSL_NL: + text_idx += 1 + + texts = new_texts + + def count_right(tokens, c_idx, r_idx, which_tokens): + span = 0 + c_idx_iter = c_idx + while tokens[r_idx][c_idx_iter] in which_tokens: + c_idx_iter += 1 + span += 1 + if c_idx_iter >= len(tokens[r_idx]): + return span + return span + + def count_down(tokens, c_idx, r_idx, which_tokens): + span = 0 + r_idx_iter = r_idx + while tokens[r_idx_iter][c_idx] in which_tokens: + r_idx_iter += 1 + span += 1 + if r_idx_iter >= len(tokens): + return span + return span + + for i, text in enumerate(texts): + cell_text = "" + if text in [ + OTSL_FCEL, + OTSL_ECEL, + ]: + row_span = 1 + col_span = 1 + right_offset = 1 + if text != OTSL_ECEL: + cell_text = texts[i + 1] + right_offset = 2 + + # Check next element(s) for lcel / ucel / xcel, + # set properly row_span, col_span + next_right_cell = "" + if i + right_offset < len(texts): + next_right_cell = texts[i + right_offset] + + next_bottom_cell = "" + if r_idx + 1 < len(split_row_tokens): + if c_idx < len(split_row_tokens[r_idx + 1]): + next_bottom_cell = split_row_tokens[r_idx + 1][c_idx] + + if next_right_cell in [ + OTSL_LCEL, + OTSL_XCEL, + ]: + # we have horisontal spanning cell or 2d spanning cell + col_span += count_right( + split_row_tokens, + c_idx + 1, + r_idx, + [OTSL_LCEL, 
OTSL_XCEL], + ) + if next_bottom_cell in [ + OTSL_UCEL, + OTSL_XCEL, + ]: + # we have a vertical spanning cell or 2d spanning cell + row_span += count_down( + split_row_tokens, + c_idx, + r_idx + 1, + [OTSL_UCEL, OTSL_XCEL], + ) + + table_cells.append( + TableCell( + text=cell_text.strip(), + row_span=row_span, + col_span=col_span, + start_row_offset_idx=r_idx, + end_row_offset_idx=r_idx + row_span, + start_col_offset_idx=c_idx, + end_col_offset_idx=c_idx + col_span, + ) + ) + if text in [ + OTSL_FCEL, + OTSL_ECEL, + OTSL_LCEL, + OTSL_UCEL, + OTSL_XCEL, + ]: + c_idx += 1 + if text == OTSL_NL: + r_idx += 1 + c_idx = 0 + return table_cells, split_row_tokens + + +def export_to_html(table_data: TableData): + nrows = table_data.num_rows + ncols = table_data.num_cols + + text = "" + + if len(table_data.table_cells) == 0: + return "" + + body = "" + + grid = table_data.grid + for i in range(nrows): + body += "" + for j in range(ncols): + cell: TableCell = grid[i][j] + + rowspan, rowstart = ( + cell.row_span, + cell.start_row_offset_idx, + ) + colspan, colstart = ( + cell.col_span, + cell.start_col_offset_idx, + ) + + if rowstart != i: + continue + if colstart != j: + continue + + content = html.escape(cell.text.strip()) + celltag = "td" + if cell.column_header: + celltag = "th" + + opening_tag = f"{celltag}" + if rowspan > 1: + opening_tag += f' rowspan="{rowspan}"' + if colspan > 1: + opening_tag += f' colspan="{colspan}"' + + body += f"<{opening_tag}>{content}" + body += "" + + # dir = get_text_direction(text) + body = f"{body}
" + + return body + + +def convert_otsl_to_html(otsl_content: str): + tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content) + table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens) + + table_data = TableData( + num_rows=len(split_row_tokens), + num_cols=( + max(len(row) for row in split_row_tokens) if split_row_tokens else 0 + ), + table_cells=table_cells, + ) + + return export_to_html(table_data) diff --git a/vendor/mineru/mineru/utils/hash_utils.py b/vendor/mineru/mineru/utils/hash_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2b82c78dd8d8839698edaa28bb78cdd25aa2ac67 --- /dev/null +++ b/vendor/mineru/mineru/utils/hash_utils.py @@ -0,0 +1,30 @@ +# Copyright (c) Opendatalab. All rights reserved. +import hashlib +import json + + +def bytes_md5(file_bytes): + hasher = hashlib.md5() + hasher.update(file_bytes) + return hasher.hexdigest().upper() + + +def str_md5(input_string): + hasher = hashlib.md5() + # 在Python3中,需要将字符串转化为字节对象才能被哈希函数处理 + input_bytes = input_string.encode('utf-8') + hasher.update(input_bytes) + return hasher.hexdigest() + + +def str_sha256(input_string): + hasher = hashlib.sha256() + # 在Python3中,需要将字符串转化为字节对象才能被哈希函数处理 + input_bytes = input_string.encode('utf-8') + hasher.update(input_bytes) + return hasher.hexdigest() + + +def dict_md5(d): + json_str = json.dumps(d, sort_keys=True, ensure_ascii=False) + return hashlib.md5(json_str.encode('utf-8')).hexdigest() \ No newline at end of file diff --git a/vendor/mineru/mineru/utils/language.py b/vendor/mineru/mineru/utils/language.py new file mode 100644 index 0000000000000000000000000000000000000000..73d382b7c436f8c0a8a7498e4ea1584b0719e8a5 --- /dev/null +++ b/vendor/mineru/mineru/utils/language.py @@ -0,0 +1,48 @@ +import os +import unicodedata + +if not os.getenv("FTLANG_CACHE"): + current_file_path = os.path.abspath(__file__) + current_dir = os.path.dirname(current_file_path) + root_dir = os.path.dirname(current_dir) + ftlang_cache_dir = 
def remove_invalid_surrogates(text):
    """Return *text* with every UTF-16 surrogate code point removed.

    Code points in the range U+D800..U+DFFF are dropped; all other
    characters are kept in their original order.
    """
    return ''.join(c for c in text if not (0xD800 <= ord(c) <= 0xDFFF))


def detect_lang(text: str) -> str:
    """Detect the language of *text* and return it lower-cased.

    Newlines and surrogate code points are stripped before detection. If
    the detector raises, detection is retried once on a copy of the text
    with all Unicode control-category characters removed; a failure of that
    second attempt propagates to the caller.

    Args:
        text: The text to classify.

    Returns:
        The lower-cased detector result, or ``""`` when *text* is empty or
        the detector result cannot be lower-cased.
    """
    if len(text) == 0:
        return ""

    text = text.replace("\n", "")
    text = remove_invalid_surrogates(text)

    try:
        lang_upper = detect_language(text)
    except Exception:
        # `except Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are never swallowed by the retry.
        # Categories starting with "C" are the Unicode control/format/other
        # classes; drop them and try once more.
        html_no_ctrl_chars = ''.join(
            ch for ch in text if unicodedata.category(ch)[0] != 'C'
        )
        lang_upper = detect_language(html_no_ctrl_chars)

    try:
        lang = lang_upper.lower()
    except Exception:
        # Best effort: a result without a usable .lower() maps to "".
        lang = ""
    return lang
def llm_aided_title(page_info_list, title_aided_config):
    """Ask an LLM to assign hierarchy levels to every title block, in place.

    All blocks whose ``type`` is ``"title"`` are collected from the pages,
    summarised as ``{index: [text, average line height, 1-based page]}`` and
    sent to the chat-completions endpoint. On success each title block gets
    a ``"level"`` key holding the integer level returned for its index.

    Args:
        page_info_list: Iterable of page dicts providing ``"para_blocks"``
            and ``"page_idx"``.
        title_aided_config: Mapping with ``"api_key"``, ``"base_url"`` and
            ``"model"``.

    Returns:
        None. Title blocks are mutated in place; nothing is changed when
        every attempt fails (an error is logged instead).
    """
    client = OpenAI(
        api_key=title_aided_config["api_key"],
        base_url=title_aided_config["base_url"],
    )
    title_dict = {}
    origin_title_list = []
    for page_info in page_info_list:
        for block in page_info["para_blocks"]:
            if block["type"] != "title":
                continue
            title_text = merge_para_with_text(block)

            if 'line_avg_height' in block:
                line_avg_height = block['line_avg_height']
            else:
                line_heights = [
                    int(line['bbox'][3] - line['bbox'][1])
                    for line in block['lines']
                ]
                if line_heights:
                    line_avg_height = sum(line_heights) / len(line_heights)
                else:
                    # No lines recorded: fall back to the block's own height.
                    line_avg_height = int(block['bbox'][3] - block['bbox'][1])

            # The key is the title's position in origin_title_list.
            title_dict[f"{len(origin_title_list)}"] = [
                title_text, line_avg_height, int(page_info['page_idx']) + 1]
            origin_title_list.append(block)
    # logger.info(f"Title list: {title_dict}")

    if not origin_title_list:
        # Nothing to optimise; avoid a pointless remote call.
        return

    title_optimize_prompt = f"""输入的内容是一篇文档中所有标题组成的字典,请根据以下指南优化标题的结果,使结果符合正常文档的层次结构:

1. 字典中每个value均为一个list,包含以下元素:
    - 标题文本
    - 文本行高是标题所在块的平均行高
    - 标题所在的页码

2. 保留原始内容:
    - 输入的字典中所有元素都是有效的,不能删除字典中的任何元素
    - 请务必保证输出的字典中元素的数量和输入的数量一致

3. 保持字典内key-value的对应关系不变

4. 优化层次结构:
    - 为每个标题元素添加适当的层次结构
    - 行高较大的标题一般是更高级别的标题
    - 标题从前至后的层级必须是连续的,不能跳过层级
    - 标题层级最多为4级,不要添加过多的层级
    - 优化后的标题只保留代表该标题的层级的整数,不要保留其他信息

5. 合理性检查与微调:
    - 在完成初步分级后,仔细检查分级结果的合理性
    - 根据上下文关系和逻辑顺序,对不合理的分级进行微调
    - 确保最终的分级结果符合文档的实际结构和逻辑
    - 字典中可能包含被误当成标题的正文,你可以通过将其层级标记为 0 来排除它们

IMPORTANT:
请直接返回优化过的由标题层级组成的字典,格式为{{标题id:标题层级}},如下:
{{
  0:1,
  1:2,
  2:2,
  3:3
}}
不需要对字典格式化,不需要返回任何其他信息。

Input title list:
{title_dict}

Corrected title list:
"""

    # NOTE(review): the marker was an empty string in the previous revision,
    # which made the strip below a no-op. It is assumed to be the closing tag
    # of a reasoning preamble -- confirm against the models actually used.
    reasoning_end_marker = "</think>"

    max_retries = 3
    succeeded = False

    for _attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=title_aided_config["model"],
                messages=[
                    {'role': 'user', 'content': title_optimize_prompt}],
                temperature=0.7,
                stream=True,
            )
            content_pieces = [
                chunk.choices[0].delta.content
                for chunk in completion
                if chunk.choices and chunk.choices[0].delta.content is not None
            ]
            content = "".join(content_pieces).strip()
            # logger.info(f"Title completion: {content}")
            if reasoning_end_marker in content:
                idx = content.index(reasoning_end_marker) + len(reasoning_end_marker)
                content = content[idx:].strip()
            dict_completion = json_repair.loads(content)
            dict_completion = {int(k): int(v) for k, v in dict_completion.items()}

            if len(dict_completion) != len(title_dict):
                logger.warning(
                    "The number of titles in the optimized result is not equal to the number of titles in the input.")
                continue

            # Resolve every level before touching any block, so a missing
            # key cannot leave the titles half-updated.
            levels = [dict_completion[idx] for idx in range(len(origin_title_list))]
        except Exception as e:
            logger.exception(e)
            continue

        for origin_title_block, level in zip(origin_title_list, levels):
            origin_title_block["level"] = level
        succeeded = True
        break

    if not succeeded:
        # Covers exceptions AND repeated count mismatches.
        logger.error("Failed to decode dict after maximum retries.")
def crop_img(input_res, input_img, crop_paste_x=0, crop_paste_y=0):
    """Crop the region described by ``input_res['poly']`` onto a white canvas.

    The crop rectangle is taken from poly indices 0/1 (top-left) and 4/5
    (bottom-right). The canvas is the crop size plus ``crop_paste_x`` /
    ``crop_paste_y`` of white padding on each side.

    Args:
        input_res: Mapping with a ``'poly'`` sequence.
        input_img: Either a numpy array (indexed ``[y, x]``) or an object
            offering ``.crop(box)`` that ``Image.paste`` accepts.
        crop_paste_x: Horizontal padding in pixels.
        crop_paste_y: Vertical padding in pixels.

    Returns:
        ``(image, [paste_x, paste_y, xmin, ymin, xmax, ymax, new_width,
        new_height])``.
    """
    poly = input_res['poly']
    crop_xmin, crop_ymin = int(poly[0]), int(poly[1])
    crop_xmax, crop_ymax = int(poly[4]), int(poly[5])

    box_width = crop_xmax - crop_xmin
    box_height = crop_ymax - crop_ymin

    # Canvas size = crop size + padding on both sides.
    crop_new_width = box_width + crop_paste_x * 2
    crop_new_height = box_height + crop_paste_y * 2

    if isinstance(input_img, np.ndarray):
        # White 3-channel canvas, then copy the crop into its centre.
        return_image = np.full((crop_new_height, crop_new_width, 3), 255, dtype=np.uint8)
        rows = slice(crop_paste_y, crop_paste_y + box_height)
        cols = slice(crop_paste_x, crop_paste_x + box_width)
        return_image[rows, cols] = input_img[crop_ymin:crop_ymax, crop_xmin:crop_xmax]
    else:
        return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
        cropped_img = input_img.crop((crop_xmin, crop_ymin, crop_xmax, crop_ymax))
        return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))

    return return_image, [
        crop_paste_x, crop_paste_y,
        crop_xmin, crop_ymin, crop_xmax, crop_ymax,
        crop_new_width, crop_new_height,
    ]


def get_coords_and_area(block_with_poly):
    """Return ``(xmin, ymin, xmax, ymax, area)`` for a block's ``'poly'``."""
    poly = block_with_poly['poly']
    xmin, ymin, xmax, ymax = int(poly[0]), int(poly[1]), int(poly[4]), int(poly[5])
    return xmin, ymin, xmax, ymax, (xmax - xmin) * (ymax - ymin)


def calculate_intersection(box1, box2):
    """Return the overlap rectangle of two boxes, or None if they do not overlap.

    Boxes are ``(xmin, ymin, xmax, ymax)``. Boxes that merely touch along an
    edge or corner count as non-overlapping.
    """
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    if right <= left or bottom <= top:
        return None
    return left, top, right, bottom


def calculate_iou(box1, box2):
    """Return intersection-over-union of two ``(xmin, ymin, xmax, ymax, area)`` boxes."""
    overlap = calculate_intersection(box1[:4], box2[:4])
    if not overlap:
        return 0

    left, top, right, bottom = overlap
    intersection_area = (right - left) * (bottom - top)
    union_area = box1[4] + box2[4] - intersection_area

    if union_area > 0:
        return intersection_area / union_area
    return 0


def is_inside(small_box, big_box, overlap_threshold=0.8):
    """Return True if at least ``overlap_threshold`` of small_box's area lies in big_box.

    Both boxes are ``(xmin, ymin, xmax, ymax, area)``.
    """
    overlap = calculate_intersection(small_box[:4], big_box[:4])
    if not overlap:
        return False

    left, top, right, bottom = overlap
    covered = (right - left) * (bottom - top)
    return covered >= overlap_threshold * small_box[4]


def do_overlap(box1, box2):
    """Return True if the two boxes share a region of non-zero area."""
    return calculate_intersection(box1[:4], box2[:4]) is not None
def filter_nested_tables(table_res_list, overlap_threshold=0.8, area_threshold=0.8):
    """Drop "big" tables that are really containers of several smaller tables.

    A table is dropped when at least three other tables lie inside it
    (per ``is_inside`` with ``overlap_threshold``), none of those inner
    tables overlap each other, and their summed area exceeds
    ``area_threshold`` times the big table's area.

    Lists with fewer than three tables are returned unchanged (same object).
    """
    if len(table_res_list) < 3:
        return table_res_list

    table_info = [get_coords_and_area(table) for table in table_res_list]
    big_tables_idx = []

    for outer, outer_info in enumerate(table_info):
        inner = [
            idx for idx, candidate in enumerate(table_info)
            if idx != outer and is_inside(candidate, outer_info, overlap_threshold)
        ]
        if len(inner) < 3:
            continue

        # Any pairwise overlap among the inner tables disqualifies the outer one.
        inner_tables_overlap = False
        for pos, first in enumerate(inner):
            for second in inner[pos + 1:]:
                if do_overlap(table_info[first], table_info[second]):
                    inner_tables_overlap = True
                    break
            if inner_tables_overlap:
                break
        if inner_tables_overlap:
            continue

        total_inside_area = sum(table_info[idx][4] for idx in inner)
        if total_inside_area > area_threshold * outer_info[4]:
            big_tables_idx.append(outer)

    return [table for idx, table in enumerate(table_res_list) if idx not in big_tables_idx]


def remove_overlaps_min_blocks(res_list):
    """Merge heavily-overlapping blocks into the larger one and drop the smaller.

    The smaller block of an overlapping pair is not simply deleted: the
    surviving block's ``'bbox'`` is first grown to the union of both boxes.
    ``res_list`` is modified in place.

    Returns:
        ``(res_list, need_remove)`` where ``need_remove`` lists the blocks
        that were removed.
    """
    need_remove = []
    for res1 in res_list:
        for res2 in res_list:
            if res1 == res2:
                continue

            # presumably yields the smaller box when the overlap ratio
            # exceeds 0.8, else None -- verify against boxbase helper
            overlap_box = get_minbox_if_overlap_by_ratio(
                res1['bbox'], res2['bbox'], 0.8
            )
            if overlap_box is None:
                continue

            # First block in the list whose bbox equals the reported box.
            res_to_remove = None
            for candidate in res_list:
                if candidate['bbox'] == overlap_box:
                    res_to_remove = candidate
                    break
            if res_to_remove is None or res_to_remove in need_remove:
                continue

            large_res = res2 if res1 == res_to_remove else res1
            lx1, ly1, lx2, ly2 = large_res['bbox']
            sx1, sy1, sx2, sy2 = res_to_remove['bbox']
            large_res['bbox'] = [
                min(lx1, sx1), min(ly1, sy1), max(lx2, sx2), max(ly2, sy2),
            ]
            need_remove.append(res_to_remove)

    for res in need_remove:
        res_list.remove(res)

    return res_list, need_remove
def clean_memory(device='cuda'):
    """Release cached accelerator memory for *device*, then run the GC.

    The device is matched by string prefix, so ``"cuda"``, ``"cuda:0"`` and
    device objects whose ``str()`` starts with ``cuda`` are all handled
    (previously only the exact string ``"cuda"`` triggered CUDA cleanup,
    while ``get_vram`` already matched by prefix).

    Args:
        device: Device name or object; ``cuda*``, ``npu*`` and ``mps*`` are
            recognised. Anything else only triggers ``gc.collect()``.
    """
    device_name = str(device)
    if device_name.startswith("cuda"):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
    elif device_name.startswith("npu"):
        if torch_npu.npu.is_available():
            torch_npu.npu.empty_cache()
    elif device_name.startswith("mps"):
        torch.mps.empty_cache()
    gc.collect()


def clean_vram(device, vram_threshold=8):
    """Call ``clean_memory`` when the device's total memory is at most ``vram_threshold`` GB.

    Nothing happens when ``get_vram`` cannot report a size for *device*.
    The time spent cleaning is logged.
    """
    total_memory = get_vram(device)
    if total_memory and total_memory <= vram_threshold:
        gc_start = time.time()
        clean_memory(device)
        gc_time = round(time.time() - gc_start, 2)
        logger.info(f"gc time: {gc_time}")


def get_vram(device):
    """Return the total memory of *device* in GB, or None if it is unknown.

    CUDA devices are reported when CUDA is available, NPU devices when the
    NPU backend is available; every other case yields None.
    """
    device_name = str(device)
    if torch.cuda.is_available() and device_name.startswith("cuda"):
        # bytes -> GB
        return torch.cuda.get_device_properties(device).total_memory / (1024 ** 3)
    if device_name.startswith("npu"):
        if torch_npu.npu.is_available():
            # bytes -> GB
            return torch_npu.npu.get_device_properties(device).total_memory / (1024 ** 3)
    return None
建立仓库模式到路径的映射 + repo_mapping = { + 'pipeline': { + 'huggingface': ModelPath.pipeline_root_hf, + 'modelscope': ModelPath.pipeline_root_modelscope, + 'default': ModelPath.pipeline_root_hf + }, + 'vlm': { + 'huggingface': ModelPath.vlm_root_hf, + 'modelscope': ModelPath.vlm_root_modelscope, + 'default': ModelPath.vlm_root_hf + } + } + + if repo_mode not in repo_mapping: + raise ValueError(f"Unsupported repo_mode: {repo_mode}, must be 'pipeline' or 'vlm'") + + # 如果没有指定model_source或值不是'modelscope',则使用默认值 + repo = repo_mapping[repo_mode].get(model_source, repo_mapping[repo_mode]['default']) + + + if model_source == "huggingface": + snapshot_download = hf_snapshot_download + elif model_source == "modelscope": + snapshot_download = ms_snapshot_download + else: + raise ValueError(f"未知的仓库类型: {model_source}") + + cache_dir = None + + if repo_mode == 'pipeline': + relative_path = relative_path.strip('/') + cache_dir = snapshot_download(repo, allow_patterns=[relative_path, relative_path+"/*"]) + elif repo_mode == 'vlm': + # VLM 模式下,根据 relative_path 的不同处理方式 + if relative_path == "/": + cache_dir = snapshot_download(repo) + else: + relative_path = relative_path.strip('/') + cache_dir = snapshot_download(repo, allow_patterns=[relative_path, relative_path+"/*"]) + + if not cache_dir: + raise FileNotFoundError(f"Failed to download model: {relative_path} from {repo}") + return cache_dir + + +if __name__ == '__main__': + path1 = "models/README.md" + root = auto_download_and_get_model_root_path(path1) + print("本地文件绝对路径:", os.path.join(root, path1)) \ No newline at end of file diff --git a/vendor/mineru/mineru/utils/ocr_utils.py b/vendor/mineru/mineru/utils/ocr_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..aedc3fc0969a3f03206e21cafd445380756008d3 --- /dev/null +++ b/vendor/mineru/mineru/utils/ocr_utils.py @@ -0,0 +1,442 @@ +# Copyright (c) Opendatalab. All rights reserved. 
def merge_spans_to_line(spans, threshold=0.6):
    """Group spans into text lines by vertical overlap.

    ``spans`` is sorted IN PLACE by the top edge (``bbox[1]``). Each span
    then joins the current line if it overlaps the line's last span
    vertically by more than ``threshold``; otherwise it starts a new line.

    Returns:
        A list of lines, each a list of span dicts; ``[]`` for empty input.
    """
    if len(spans) == 0:
        return []

    spans.sort(key=lambda span: span['bbox'][1])

    lines = [[spans[0]]]
    for span in spans[1:]:
        current_line = lines[-1]
        if _is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'], threshold):
            current_line.append(span)
        else:
            lines.append([span])
    return lines


def _is_overlaps_y_exceeds_threshold(bbox1,
                                     bbox2,
                                     overlap_ratio_threshold=0.8):
    """Return True if the vertical overlap exceeds the threshold.

    The overlap height is measured relative to the shorter of the two
    boxes. A box of zero or negative height never matches.
    """
    _, top1, _, bottom1 = bbox1
    _, top2, _, bottom2 = bbox2

    shorter = min(bottom1 - top1, bottom2 - top2)
    if shorter <= 0:
        return False

    shared = max(0, min(bottom1, bottom2) - max(top1, top2))
    return (shared / shorter) > overlap_ratio_threshold


def _is_overlaps_x_exceeds_threshold(bbox1,
                                     bbox2,
                                     overlap_ratio_threshold=0.8):
    """Return True if the horizontal overlap exceeds the threshold.

    The overlap width is measured relative to the narrower of the two
    boxes. A box of zero or negative width never matches.
    """
    left1, _, right1, _ = bbox1
    left2, _, right2, _ = bbox2

    narrower = min(right1 - left1, right2 - left2)
    if narrower <= 0:
        return False

    shared = max(0, min(right1, right2) - max(left1, left2))
    return (shared / narrower) > overlap_ratio_threshold
def sorted_boxes(dt_boxes):
    """Order text boxes top-to-bottom, then left-to-right within a row.

    Boxes are first sorted by their first point's ``(y, x)``. A follow-up
    pass then swaps neighbours whose first points are less than 10 units
    apart vertically but out of order horizontally.

    Args:
        dt_boxes: Array of boxes; must expose ``.shape[0]`` and yield
            point sequences when iterated.

    Returns:
        A list of the boxes in reading order.
    """
    num_boxes = dt_boxes.shape[0]
    ordered = sorted(dt_boxes, key=lambda box: (box[0][1], box[0][0]))

    for i in range(num_boxes - 1):
        j = i
        while j >= 0:
            upper, lower = ordered[j], ordered[j + 1]
            on_same_row = abs(lower[0][1] - upper[0][1]) < 10
            if not (on_same_row and lower[0][0] < upper[0][0]):
                break
            ordered[j], ordered[j + 1] = lower, upper
            j -= 1
    return ordered


def bbox_to_points(bbox):
    """Convert ``[x0, y0, x1, y1]`` into a float32 array of its four corners.

    Corners are ordered top-left, top-right, bottom-right, bottom-left.
    """
    x0, y0, x1, y1 = bbox
    corners = [[x0, y0], [x1, y0], [x1, y1], [x0, y1]]
    return np.array(corners).astype('float32')


def points_to_bbox(points):
    """Convert a four-corner array back into ``[x0, y0, x1, y1]``.

    x0/y0 come from the first point, x1 from the second, y1 from the third.
    """
    (x0, y0), (x1, _), (_, y1) = points[0], points[1], points[2]
    return [x0, y0, x1, y1]
def merge_overlapping_spans(spans):
    """Merge spans on one line whose horizontal extents overlap or touch.

    Spans are processed left to right. A span whose left edge is at or
    before the right edge of the previously merged span is fused with it
    into their common bounding box; otherwise it starts a new entry.

    The input list is left untouched (it used to be sorted in place, a
    hidden side effect on the caller's data).

    Args:
        spans: Sequence of ``(x1, y1, x2, y2)`` boxes.

    Returns:
        A new list of spans ordered by left edge. Untouched spans are the
        original objects; fused spans are tuples.
    """
    if not spans:
        return []

    merged = []
    # sorted() is stable like list.sort(), so ties keep their input order.
    for span in sorted(spans, key=lambda s: s[0]):
        x1, y1, x2, y2 = span
        if not merged or merged[-1][2] < x1:
            # No horizontal contact with the previous span.
            merged.append(span)
            continue

        last = merged[-1]
        merged[-1] = (
            min(last[0], x1),
            min(last[1], y1),
            max(last[2], x2),
            max(last[3], y2),
        )
    return merged
def get_adjusted_mfdetrec_res(single_page_mfdetrec_res, useful_list):
    """Translate formula boxes from page coordinates into crop coordinates.

    Each ``"bbox"`` is shifted by the crop origin (``xmin``/``ymin``) and
    the paste padding. Boxes that end up entirely left of / above the crop,
    or that start beyond its width / height, are discarded.

    Args:
        single_page_mfdetrec_res: Iterable of dicts with a ``"bbox"`` of
            ``[x0, y0, x1, y1]``.
        useful_list: ``[paste_x, paste_y, xmin, ymin, xmax, ymax,
            new_width, new_height]``.

    Returns:
        A list of ``{"bbox": [x0, y0, x1, y1]}`` dicts in crop coordinates.
    """
    paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list

    adjusted_mfdetrec_res = []
    for mf_res in single_page_mfdetrec_res:
        mf_xmin, mf_ymin, mf_xmax, mf_ymax = mf_res["bbox"]

        # Page coordinates -> coordinates relative to the padded crop.
        x0 = mf_xmin - xmin + paste_x
        y0 = mf_ymin - ymin + paste_y
        x1 = mf_xmax - xmin + paste_x
        y1 = mf_ymax - ymin + paste_y

        ends_before_crop = x1 < 0 or y1 < 0
        starts_after_crop = x0 > new_width or y0 > new_height
        if ends_before_crop or starts_after_crop:
            continue

        adjusted_mfdetrec_res.append({"bbox": [x0, y0, x1, y1]})
    return adjusted_mfdetrec_res
def calculate_is_angle(poly):
    """Return True if the quadrilateral looks rotated rather than axis-aligned.

    The mean of the left and right edge heights is compared with the
    vertical distance between the first and third points; when that
    distance lies within 80%..120% of the mean height the box is treated
    as not angled.
    """
    p1, p2, p3, p4 = poly
    height = ((p4[1] - p1[1]) + (p3[1] - p2[1])) / 2
    diagonal_rise = p3[1] - p1[1]
    # logger.info((p3[1] - p1[1])/height)
    return not (0.8 * height <= diagonal_rise <= 1.2 * height)


def get_rotate_crop_image(img, points):
    """Warp the quadrilateral *points* out of *img* into an upright rectangle.

    The output size is the longer of each pair of opposite edges. If the
    result is at least 1.5 times taller than wide it is rotated by 90
    degrees with ``np.rot90``.

    Args:
        img: Source image array.
        points: Four corner points, shape 4x2.

    Returns:
        The cropped (and possibly rotated) image.
    """
    assert len(points) == 4, "shape of points must be 4*2"

    top_edge = np.linalg.norm(points[0] - points[1])
    bottom_edge = np.linalg.norm(points[2] - points[3])
    left_edge = np.linalg.norm(points[0] - points[3])
    right_edge = np.linalg.norm(points[1] - points[2])

    img_crop_width = int(max(top_edge, bottom_edge))
    img_crop_height = int(max(left_edge, right_edge))

    pts_std = np.float32([
        [0, 0],
        [img_crop_width, 0],
        [img_crop_width, img_crop_height],
        [0, img_crop_height],
    ])
    M = cv2.getPerspectiveTransform(points, pts_std)
    dst_img = cv2.warpPerspective(
        img,
        M, (img_crop_width, img_crop_height),
        borderMode=cv2.BORDER_REPLICATE,
        flags=cv2.INTER_CUBIC)

    dst_img_height, dst_img_width = dst_img.shape[0:2]
    if dst_img_height * 1.0 / dst_img_width >= 1.5:
        return np.rot90(dst_img)
    return dst_img
def classify(pdf_bytes):
    """Decide whether a PDF's text can be extracted directly or needs OCR.

    A sample of pages is taken with ``extract_pages`` and checked in order:
    average non-whitespace characters per page, presence of garbled
    ``(cid:N)`` text, and the share of pages dominated by images.

    Args:
        pdf_bytes: Raw bytes of the PDF file.

    Returns:
        str: ``'txt'`` when text can be extracted directly, ``'ocr'`` when OCR
        is needed. Any exception raised while classifying is logged and also
        results in ``'ocr'``.
    """
    # Fewer than this many cleaned characters per page on average means OCR.
    min_avg_chars_per_page = 50

    try:
        sample_pdf_bytes = extract_pages(pdf_bytes)
        pdf = pdfium.PdfDocument(sample_pdf_bytes)

        page_count = len(pdf)
        if page_count == 0:
            return 'ocr'

        # Inspect at most 10 pages.
        pages_to_check = min(page_count, 10)

        if get_avg_cleaned_chars_per_page(pdf, pages_to_check) < min_avg_chars_per_page:
            return 'ocr'
        if detect_invalid_chars(sample_pdf_bytes):
            return 'ocr'
        if get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check) >= 0.8:
            return 'ocr'
        return 'txt'
    except Exception as e:
        logger.error(f"判断PDF类型时出错: {e}")
        # Fall back to OCR whenever classification itself fails.
        return 'ocr'
def get_high_image_coverage_ratio(sample_pdf_bytes, pages_to_check):
    """Return the fraction of inspected pages that are mostly covered by images.

    Each page is laid out with pdfminer; the bounding-box areas of its
    top-level ``LTImage``/``LTFigure`` elements are summed and divided by the
    page area (capped at 1.0). A page counts as "high coverage" when that
    ratio is at least 0.8.

    Args:
        sample_pdf_bytes: Bytes of the (sampled) PDF to analyse.
        pages_to_check: Maximum number of pages to inspect.

    Returns:
        float: ``1.0`` when the document does not allow extraction, ``0.0``
        when no page was processed, otherwise high-coverage pages divided by
        inspected pages.
    """
    # The context manager closes the stream on every exit path; previously the
    # early returns (and any exception) skipped the explicit close().
    with BytesIO(sample_pdf_bytes) as pdf_stream:
        parser = PDFParser(pdf_stream)
        document = PDFDocument(parser)

        # Content that may not be extracted is treated as fully image-covered.
        if not document.is_extractable:
            return 1.0

        rsrcmgr = PDFResourceManager()
        laparams = LAParams(
            line_overlap=0.5,
            char_margin=2.0,
            line_margin=0.5,
            word_margin=0.1,
            boxes_flow=None,
            detect_vertical=False,
            all_texts=False,
        )
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        high_image_coverage_pages = 0
        page_count = 0

        for page in PDFPage.create_pages(document):
            # Stop once the requested number of pages has been inspected.
            if page_count >= pages_to_check:
                break

            interpreter.process_page(page)
            layout = device.get_result()

            page_area = layout.width * layout.height

            # Total bounding-box area of image/figure elements on this page.
            image_area = sum(
                element.width * element.height
                for element in layout
                if isinstance(element, (LTImage, LTFigure))
            )

            coverage_ratio = min(image_area / page_area, 1.0) if page_area > 0 else 0

            # 80% is the threshold for a "high coverage" page.
            if coverage_ratio >= 0.8:
                high_image_coverage_pages += 1

            page_count += 1

        if page_count == 0:
            return 0.0

        return high_image_coverage_pages / page_count
def detect_invalid_chars(sample_pdf_bytes: bytes) -> bool:
    """Detect whether the PDF's extracted text is largely garbled.

    pdfminer is slow, so callers are expected to pass a sample of roughly ten
    pages rather than the whole document. Garbled text shows up in pdfminer's
    output as ``(cid:N)`` placeholders.

    Args:
        sample_pdf_bytes: Bytes of the sampled PDF.

    Returns:
        bool: ``True`` when more than 5% of the characters (counting each
        ``(cid:N)`` placeholder as one character) are placeholders, ``False``
        otherwise, including when no text is extracted at all.
    """
    layout_params = LAParams(
        line_overlap=0.5,
        char_margin=2.0,
        line_margin=0.5,
        word_margin=0.1,
        boxes_flow=None,
        detect_vertical=False,
        all_texts=False,
    )
    extracted = extract_text(pdf_file=BytesIO(sample_pdf_bytes), laparams=layout_params)
    extracted = extracted.replace("\n", "")

    total_len = len(extracted)
    if total_len == 0:
        return False

    placeholders = re.findall(r'\(cid:\d+\)', extracted)
    placeholder_count = len(placeholders)
    placeholder_len = sum(map(len, placeholders))

    # Each placeholder counts as a single character in the denominator.
    cid_ratio = placeholder_count / (placeholder_count + total_len - placeholder_len)
    return cid_ratio > 0.05
def load_images_from_pdf(
    pdf_bytes: bytes,
    dpi=200,
    start_page_id=0,
    end_page_id=None,
):
    """Render a page range of a PDF into image dictionaries.

    Args:
        pdf_bytes: Raw bytes of the PDF.
        dpi: Rendering resolution forwarded to ``pdf_page_to_image``.
        start_page_id: First page index to render (inclusive).
        end_page_id: Last page index to render (inclusive). ``None`` or a
            negative value means the last page; a value past the end is
            clamped to the last page with a warning.

    Returns:
        tuple: ``(images_list, pdf_doc)``, where ``images_list`` holds one
        dict per rendered page and ``pdf_doc`` is the still-open
        ``pdfium.PdfDocument``.
    """
    pdf_doc = pdfium.PdfDocument(pdf_bytes)
    last_index = len(pdf_doc) - 1

    if end_page_id is None or end_page_id < 0:
        end_page_id = last_index
    elif end_page_id > last_index:
        logger.warning("end_page_id is out of range, use images length")
        end_page_id = last_index

    # Page indices never go below zero, so a negative start behaves like 0.
    first_index = max(start_page_id, 0)
    images_list = [
        pdf_page_to_image(pdf_doc[page_index], dpi=dpi)
        for page_index in range(first_index, end_page_id + 1)
    ]

    return images_list, pdf_doc
def get_crop_img(bbox: tuple, pil_img, scale=2):
    """Crop ``pil_img`` to ``bbox`` after scaling the box.

    Args:
        bbox: ``(x0, y0, x1, y1)`` in unscaled coordinates.
        pil_img: Object exposing ``crop(box)``.
        scale: Factor applied to every coordinate before truncating to int.

    Returns:
        Whatever ``pil_img.crop`` returns for the scaled integer box.
    """
    left, top, right, bottom = (int(bbox[axis] * scale) for axis in range(4))
    return pil_img.crop((left, top, right, bottom))
def page_to_image(
    page: PdfPage,
    dpi: int = 144,  # changed from 200 to 144
    max_width_or_height: int = 2560,  # changed from 4500 to 2560
) -> tuple[Image.Image, float]:
    """Render a PDF page to a PIL image.

    Args:
        page: The pdfium page to render.
        dpi: Target resolution; the render scale starts as ``dpi / 72``.
        max_width_or_height: Upper bound for the rendered long side. When the
            scaled long side would exceed it, the scale is reduced so the long
            side matches this bound.

    Returns:
        tuple: ``(image, scale)``, the rendered image and the scale used.
    """
    scale = dpi / 72

    long_side_length = max(page.get_size())
    if long_side_length * scale > max_width_or_height:
        scale = max_width_or_height / long_side_length

    bitmap: PdfBitmap = page.render(scale=scale)  # type: ignore
    try:
        image = bitmap.to_pil()
    finally:
        # Best-effort release of the native bitmap; a failing close must not
        # mask the result or an exception raised by to_pil().
        try:
            bitmap.close()
        except Exception:
            pass
    return image, scale
def pdf_to_images_b64strs(
    pdf: str | bytes | PdfDocument,
    dpi: int = 144,
    max_width_or_height: int = 2560,
    start_page_id: int = 0,
    end_page_id: int | None = None,
    image_format: str = "PNG",
) -> list[str]:
    """Render PDF pages and return each one as a base64-encoded image string.

    Rendering is delegated to ``pdf_to_images`` with the same page-range
    arguments; every rendered image is then encoded with ``image_to_b64str``
    using ``image_format``.
    """
    rendered_pages = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
    encoded_pages = []
    for rendered in rendered_pages:
        encoded_pages.append(image_to_b64str(rendered, image_format))
    return encoded_pages
import asyncio
import threading
from queue import Queue
from typing import Any, AsyncIterable, Coroutine, Iterable, TypeVar

T = TypeVar("T")

# Tags for the (tag, payload) messages passed from the producer thread to the
# consumer in iter_async. Tagging keeps control signals out of band, so an
# iterable may legitimately yield None or exception instances.
_ITEM = "item"
_ERROR = "error"
_DONE = "done"


def run_async(coroutine: Coroutine[Any, Any, T]) -> T:
    """Run ``coroutine`` to completion from synchronous code and return its result.

    When no event loop is running in the calling thread the coroutine is
    executed with ``asyncio.run``. When a loop is already running,
    ``run_until_complete`` cannot be used (it raises ``RuntimeError``), so the
    coroutine is executed with ``asyncio.run`` in a short-lived worker thread
    while the caller blocks until it finishes.

    NOTE: in the second case the calling thread, and therefore its event loop,
    is blocked while waiting, so the coroutine must not depend on progress
    being made by the caller's loop.

    Args:
        coroutine: The coroutine object to execute.

    Returns:
        The value returned by the coroutine.

    Raises:
        ValueError: If ``coroutine`` is not a coroutine object.
        BaseException: Whatever the coroutine itself raises is re-raised.
    """
    if not asyncio.iscoroutine(coroutine):
        raise ValueError("a coroutine was expected, got {!r}".format(coroutine))

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop in this thread: run directly.
        return asyncio.run(coroutine)

    outcome = {}

    def runner():
        try:
            outcome["value"] = asyncio.run(coroutine)
        except BaseException as exc:  # forwarded to the caller below
            outcome["error"] = exc

    worker = threading.Thread(target=runner, daemon=True)
    worker.start()
    worker.join()

    if "error" in outcome:
        raise outcome["error"]
    return outcome["value"]


def iter_async(iterable: AsyncIterable[T]) -> Iterable[T]:
    """Expose an async iterable as a regular (blocking) generator.

    The async iterable is consumed in a daemon thread and its items are handed
    over through a queue.

    Args:
        iterable: The async iterable to drain.

    Yields:
        The items produced by ``iterable``, in order.

    Raises:
        ValueError: If ``iterable`` is not an async iterable.
        BaseException: Any error raised while iterating is re-raised in the
            consumer at the position where it occurred.
    """
    if not isinstance(iterable, AsyncIterable):
        raise ValueError("an async iterable was expected, got {!r}".format(iterable))

    queue = Queue()

    async def async_helper():
        try:
            async for chunk in iterable:
                queue.put((_ITEM, chunk))
        except BaseException as e:
            # Always signal the consumer; otherwise it would block forever.
            queue.put((_ERROR, e))
        else:
            queue.put((_DONE, None))

    def helper():
        run_async(async_helper())

    thread = threading.Thread(target=helper, daemon=True)
    thread.start()

    while True:
        tag, payload = queue.get()
        if tag == _DONE:
            break
        if tag == _ERROR:
            raise payload
        yield payload

    thread.join()
def fill_spans_in_blocks(blocks, spans, radio):
    """Assign each span to the first block that contains it.

    Blocks are visited in order. A span goes into a block when the share of
    the span's area that overlaps the block exceeds ``radio`` and the span
    type is compatible with the block type. Assigned spans are removed from
    ``spans`` so that later blocks cannot claim them again.

    Args:
        blocks: Sequences where ``[0:4]`` is the bbox and ``[7]`` the block
            type; for image/table blocks the last element is used as
            ``group_id``.
        spans: List of span dicts with ``'bbox'`` and ``'type'`` keys. The
            list is modified in place.
        radio: Minimum overlap ratio (exclusive) for a span to join a block.

    Returns:
        tuple: ``(block_with_spans, spans)``, the per-block dicts (``type``,
        ``bbox``, optional ``group_id``, ``spans``) and the same ``spans``
        list, now holding only the unassigned spans.
    """
    grouped_block_types = [
        BlockType.IMAGE_BODY, BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE,
        BlockType.TABLE_BODY, BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
    ]

    block_with_spans = []
    for block in blocks:
        block_type = block[7]
        block_bbox = block[0:4]
        block_dict = {
            'type': block_type,
            'bbox': block_bbox,
        }
        if block_type in grouped_block_types:
            block_dict['group_id'] = block[-1]

        # Partition in a single pass instead of calling list.remove() per
        # span, which is quadratic and compares whole span dicts by value.
        block_spans = []
        remaining_spans = []
        for span in spans:
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > radio \
                    and span_block_type_compatible(span['type'], block_type):
                block_spans.append(span)
            else:
                remaining_spans.append(span)

        block_dict['spans'] = block_spans
        block_with_spans.append(block_dict)

        # Keep mutating the caller's list, exactly as before.
        if block_spans:
            spans[:] = remaining_spans

    return block_with_spans, spans
def fix_text_block(block):
    """Convert a text block's spans into ordered lines.

    Interline-equation spans inside a text block are re-typed as inline
    equations. When more than ``VERTICAL_SPAN_IN_BLOCK_THRESHOLD`` of the
    spans are taller than
    ``VERTICAL_SPAN_HEIGHT_TO_WIDTH_RATIO_THRESHOLD`` times their width, the
    block is treated as vertical text and merged column-wise; otherwise it is
    merged into horizontal lines.

    Args:
        block: Dict with a ``'spans'`` list; each span has ``'bbox'`` and
            ``'type'``. The dict is modified in place.

    Returns:
        dict: The same block with ``'lines'`` added and ``'spans'`` removed.
    """
    spans = block['spans']

    # Equation spans inside a text block are always inline.
    for span in spans:
        if span['type'] == ContentType.INTERLINE_EQUATION:
            span['type'] = ContentType.INLINE_EQUATION

    def _is_vertical(span):
        width = span['bbox'][2] - span['bbox'][0]
        height = span['bbox'][3] - span['bbox'][1]
        # A zero-width span previously raised ZeroDivisionError; it is now
        # simply not counted as vertical.
        return width != 0 and height / width > VERTICAL_SPAN_HEIGHT_TO_WIDTH_RATIO_THRESHOLD

    total_span_count = len(spans)
    if total_span_count == 0:
        vertical_ratio = 0
    else:
        vertical_ratio = sum(1 for span in spans if _is_vertical(span)) / total_span_count

    if vertical_ratio > VERTICAL_SPAN_IN_BLOCK_THRESHOLD:
        # Vertical text: build columns, then order spans top to bottom.
        block_lines = merge_spans_to_vertical_line(spans)
        sort_block_lines = vertical_line_sort_spans_from_top_to_bottom(block_lines)
    else:
        block_lines = merge_spans_to_line(spans)
        sort_block_lines = line_sort_spans_by_left_to_right(block_lines)

    block['lines'] = sort_block_lines
    del block['spans']
    return block
def fix_block_spans(block_with_spans):
    """Turn the spans of every supported block into sorted lines.

    Text-like blocks (text, title, image/table captions and footnotes) go
    through ``fix_text_block``; interline equations and image/table bodies go
    through ``fix_interline_block``. Blocks of any other type are dropped.

    Args:
        block_with_spans: Iterable of block dicts with ``'type'`` and
            ``'spans'``.

    Returns:
        list: The processed blocks, in input order.
    """
    # IMAGE_FOOTNOTE was missing here (IMAGE_CAPTION was listed twice), which
    # silently dropped image footnote blocks.
    text_like_types = [
        BlockType.TEXT, BlockType.TITLE,
        BlockType.IMAGE_CAPTION, BlockType.IMAGE_FOOTNOTE,
        BlockType.TABLE_CAPTION, BlockType.TABLE_FOOTNOTE
    ]
    interline_types = [BlockType.INTERLINE_EQUATION, BlockType.IMAGE_BODY, BlockType.TABLE_BODY]

    fix_blocks = []
    for block in block_with_spans:
        block_type = block['type']

        if block_type in text_like_types:
            block = fix_text_block(block)
        elif block_type in interline_types:
            block = fix_interline_block(block)
        else:
            continue
        fix_blocks.append(block)
    return fix_blocks
def remove_outside_spans(spans, all_bboxes, all_discarded_blocks):
    """Keep only the spans that lie inside a block of a matching kind.

    A span is kept when more than 40% of its area overlaps a discarded block,
    or when more than 50% of its area overlaps a block of the right kind:
    image bodies for image spans, table bodies for table spans, any other
    block type for all remaining spans.

    Args:
        spans: Span dicts with ``'bbox'`` and ``'type'``.
        all_bboxes: Block sequences (``[0:4]`` bbox, ``[7]`` block type).
        all_discarded_blocks: Block sequences in the same layout.

    Returns:
        list: A new list with the retained spans, in input order.
    """
    def get_block_bboxes(blocks, block_type_list):
        return [block[0:4] for block in blocks if block[7] in block_type_list]

    def covered_by_any(span_bbox, candidate_bboxes, min_ratio):
        return any(
            calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > min_ratio
            for block_bbox in candidate_bboxes
        )

    body_types = [BlockType.IMAGE_BODY, BlockType.TABLE_BODY]
    # Every string-valued attribute of BlockType except the two body types.
    other_block_type = [
        value for value in BlockType.__dict__.values()
        if isinstance(value, str) and value not in body_types
    ]

    image_bboxes = get_block_bboxes(all_bboxes, [BlockType.IMAGE_BODY])
    table_bboxes = get_block_bboxes(all_bboxes, [BlockType.TABLE_BODY])
    other_block_bboxes = get_block_bboxes(all_bboxes, other_block_type)
    discarded_block_bboxes = get_block_bboxes(all_discarded_blocks, [BlockType.DISCARDED])

    new_spans = []
    for span in spans:
        span_bbox = span['bbox']
        span_type = span['type']

        # Spans inside discarded regions are always kept.
        if covered_by_any(span_bbox, discarded_block_bboxes, 0.4):
            new_spans.append(span)
            continue

        if span_type == ContentType.IMAGE:
            candidates = image_bboxes
        elif span_type == ContentType.TABLE:
            candidates = table_bboxes
        else:
            candidates = other_block_bboxes

        if covered_by_any(span_bbox, candidates, 0.5):
            new_spans.append(span)

    return new_spans
def remove_overlaps_min_spans(spans):
    """Drop the smaller span of every strongly overlapping pair.

    For each ordered pair of distinct spans not already marked for removal,
    ``get_minbox_if_overlap_by_ratio`` (ratio 0.65) is asked for a box. If it
    returns one, the first span in ``spans`` whose bbox equals that box is
    marked for removal.

    Args:
        spans: List of span dicts with a ``'bbox'`` key; modified in place.

    Returns:
        tuple: ``(spans, dropped_spans)``, the same list without the dropped
        spans, and the dropped spans in the order they were marked.
    """
    dropped_spans = []
    for first in spans:
        for second in spans:
            if first == second:
                continue
            # Neither span of the pair may already be marked for removal.
            if first in dropped_spans or second in dropped_spans:
                continue

            overlap_box = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], 0.65)
            if overlap_box is None:
                continue

            victim = next((span for span in spans if span['bbox'] == overlap_box), None)
            if victim is not None and victim not in dropped_spans:
                dropped_spans.append(victim)

    for victim in dropped_spans:
        spans.remove(victim)

    return spans, dropped_spans
in page_dict['blocks']: + for line in block['lines']: + if 0 < abs(line['rotation']) < 90: + # 旋转角度在0-90度之间的行,直接跳过 + continue + page_all_lines.append(line) + for span in line['spans']: + for char in span['chars']: + page_all_chars.append(char) + + # 计算所有sapn的高度的中位数 + span_height_list = [] + for span in spans: + if span['type'] in [ContentType.TEXT]: + span_height = span['bbox'][3] - span['bbox'][1] + span['height'] = span_height + span['width'] = span['bbox'][2] - span['bbox'][0] + span_height_list.append(span_height) + if len(span_height_list) == 0: + return spans + else: + median_span_height = statistics.median(span_height_list) + + useful_spans = [] + unuseful_spans = [] + # 纵向span的两个特征:1. 高度超过多个line 2. 高宽比超过某个值 + vertical_spans = [] + for span in spans: + if span['type'] in [ContentType.TEXT]: + for block in all_bboxes + all_discarded_blocks: + if block[7] in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.INTERLINE_EQUATION]: + continue + if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5: + if span['height'] > median_span_height * 3 and span['height'] > span['width'] * 3: + vertical_spans.append(span) + elif block in all_bboxes: + useful_spans.append(span) + else: + unuseful_spans.append(span) + break + + """垂直的span框直接用line进行填充""" + if len(vertical_spans) > 0: + for pdfium_line in page_all_lines: + for span in vertical_spans: + if calculate_overlap_area_in_bbox1_area_ratio(pdfium_line['bbox'].bbox, span['bbox']) > 0.5: + for pdfium_span in pdfium_line['spans']: + span['content'] += pdfium_span['text'] + break + + for span in vertical_spans: + if len(span['content']) == 0: + spans.remove(span) + + """水平的span框先用char填充,再用ocr填充空的span框""" + new_spans = [] + + for span in useful_spans + unuseful_spans: + if span['type'] in [ContentType.TEXT]: + span['chars'] = [] + new_spans.append(span) + + need_ocr_spans = fill_char_in_spans(new_spans, page_all_chars, median_span_height) + + """对未填充的span进行ocr""" + if len(need_ocr_spans) > 0: + + 
def fill_char_in_spans(spans, all_chars, median_span_height):
    """Distribute page characters into spans and report spans that still need OCR.

    Spans are bucketed into horizontal bands of height ``median_span_height``
    so each character is only tested against spans that share the band of its
    vertical centre. A character goes to the first candidate span accepted by
    ``calculate_char_in_span``. Afterwards ``chars_to_content`` builds each
    span's text.

    Args:
        spans: Span dicts with ``'bbox'``, ``'chars'`` (list), ``'height'``
            and ``'width'``. ``'height'`` and ``'width'`` are deleted from
            each span before returning.
        all_chars: Character dicts with ``'bbox'`` and ``'char'``.
        median_span_height: Band height used for bucketing.

    Returns:
        list: Spans whose content looks too short for their size
        (``len(content) * height < width * 0.5``), in top-to-bottom order.
    """
    # Process spans from top to bottom.
    ordered_spans = sorted(spans, key=lambda item: item['bbox'][1])

    band_height = median_span_height
    bands = collections.defaultdict(list)
    for candidate in ordered_spans:
        first_band = int(candidate['bbox'][1] / band_height)
        last_band = int(candidate['bbox'][3] / band_height)
        for band in range(first_band, last_band + 1):
            bands[band].append(candidate)

    for char in all_chars:
        center_y = (char['bbox'][1] + char['bbox'][3]) / 2
        band = int(center_y / band_height)
        # .get() avoids creating empty buckets in the defaultdict.
        for candidate in bands.get(band, []):
            if calculate_char_in_span(char['bbox'], candidate['bbox'], char['char']):
                candidate['chars'].append(char)
                break

    need_ocr_spans = []
    for span in ordered_spans:
        chars_to_content(span)
        # Some spans hold only a placeholder or two instead of real text;
        # filter them by comparing content length against the span size.
        looks_empty = len(span['content']) * span['height'] < span['width'] * 0.5
        if looks_empty:
            need_ocr_spans.append(span)
        del span['height'], span['width']
    return need_ocr_spans
def chars_to_content(span):
    """Build ``span['content']`` from the characters collected in ``span['chars']``.

    Characters are ordered by ``char_idx``. A space is inserted after a
    character when the gap to the next character exceeds a quarter of the
    median character width and neither of the two is itself a space. The text
    is then passed through the unicode and ligature replacements and
    stripped.

    When ``span['chars']`` is empty, ``span['content']`` is left untouched. In
    every case the ``'chars'`` key is removed from the span.

    Args:
        span: Span dict with a ``'chars'`` list of character dicts (``'bbox'``,
            ``'char'``, ``'char_idx'``). Modified in place.
    """
    if span['chars']:
        chars = sorted(span['chars'], key=lambda x: x['char_idx'])
        span['chars'] = chars

        median_width = statistics.median(char['bbox'][2] - char['bbox'][0] for char in chars)

        # Pair each char with its successor by position. The previous
        # list.index() lookup was quadratic and returned the wrong neighbour
        # whenever two char dicts compared equal.
        pieces = []
        for char, next_char in zip(chars, chars[1:] + [None]):
            needs_space = (
                next_char
                and next_char['bbox'][0] - char['bbox'][2] > median_width * 0.25
                and char['char'] != ' '
                and next_char['char'] != ' '
            )
            pieces.append(f"{char['char']} " if needs_space else char['char'])

        content = ''.join(pieces)
        content = __replace_unicode(content)
        # One ligature pass is enough: the replacements are plain ASCII
        # letters, so a second pass would find nothing left to replace.
        content = __replace_ligatures(content)
        span['content'] = content.strip()

    del span['chars']