| import asyncio | |
| from src.file_handler.handlers import ( | |
| convert_docx_to_markdown, | |
| convert_excel_bytes_to_llm_format, | |
| convert_file_to_string, | |
| convert_image_to_pillow, | |
| convert_pdf_to_markdown, | |
| ) | |
async def aparse_file(task_id: str, file_name: str, api_base_url: str) -> str:
    """
    Parses a file and returns its content in a format suitable for LLMs.

    The handler is selected from the extension of ``file_name`` (the text
    after the last dot). The comparison is case-insensitive, so ``REPORT.PDF``
    is routed like ``report.pdf``. A name that contains no dot has no
    extension and is passed to the generic string converter.

    Args:
        task_id (str): The ID of the task; forwarded to the selected handler.
        file_name (str): The name of the file; only its extension is inspected.
        api_base_url (str): The base URL of the API; forwarded to the handler.

    Returns:
        str: The value produced by the selected handler. For ``mp3`` files no
        handler is invoked and ``None`` is returned.
        NOTE(review): the annotation says ``str``, but the ``mp3`` branch
        returns ``None`` and the image handler's return type is not visible
        here -- confirm and widen the annotation if needed.
    """
    # rpartition lets us tell "no dot at all" apart from a real extension, so
    # a bare name such as "pdf" is not mistaken for a PDF document.
    _, dot, suffix = file_name.rpartition(".")
    file_extension = suffix.lower() if dot else ""

    if file_extension == "mp3":
        # No converter is invoked for mp3 files; callers receive None.
        return None
    if file_extension == "xlsx":
        return await convert_excel_bytes_to_llm_format(task_id, api_base_url)
    if file_extension == "docx":
        return await convert_docx_to_markdown(task_id, api_base_url)
    if file_extension in {"jpg", "jpeg", "png", "gif", "bmp", "tiff", "webp"}:
        return await convert_image_to_pillow(task_id, api_base_url)
    if file_extension == "pdf":
        return await convert_pdf_to_markdown(task_id, api_base_url)

    # Anything else (including names without an extension) is read as text.
    return await convert_file_to_string(task_id, api_base_url)
def parse_file(task_id: str, file_name: str, api_base_url: str) -> str:
    """
    Synchronous entry point for ``aparse_file``.

    Runs the ``aparse_file`` coroutine to completion with ``asyncio.run`` and
    hands back its result. Because ``asyncio.run`` is used, this function
    cannot be called while another asyncio event loop is already running in
    the same thread; use ``aparse_file`` directly from async code.

    Args:
        task_id (str): The ID of the task.
        file_name (str): The name of the file.
        api_base_url (str): The base URL of the API.

    Returns:
        str: Whatever ``aparse_file`` returns for the given file.
    """
    parse_coroutine = aparse_file(task_id, file_name, api_base_url)
    return asyncio.run(parse_coroutine)