Spaces:

DroolingPanda
/

teachingAssistant

Build error

teachingAssistant/src/infrastructure/base/translation_provider_base.py

Michael Hu

Revert "lgos"

781eb5f 23 days ago

12 kB

	"""Base class for translation provider implementations."""

	import logging
	import re
	from abc import ABC, abstractmethod
	from typing import List, TYPE_CHECKING

	# These names are needed only for the string annotations used below, so they
	# are imported under TYPE_CHECKING and are never executed at runtime.
	if TYPE_CHECKING:
	    from ...domain.models.translation_request import TranslationRequest
	    from ...domain.models.text_content import TextContent

	from ...domain.interfaces.translation import ITranslationService
	from ...domain.exceptions import TranslationFailedException

	logger = logging.getLogger(__name__)


	class TranslationProviderBase(ITranslationService, ABC):
	    """Abstract base class for translation provider implementations.

	    Implements the provider-independent part of a translation: validating the
	    request, splitting long text into chunks, translating the chunks one by
	    one and joining the results. Concrete providers implement
	    ``_translate_chunk``, ``is_available`` and ``get_supported_languages``.
	    """

	    def __init__(self, provider_name: str, supported_languages: dict[str, list[str]] = None):
	        """
	        Initialize the translation provider.

	        Args:
	            provider_name: Name of the translation provider
	            supported_languages: Dict mapping source languages to supported
	                target languages. ``None`` or an empty dict is stored as
	                ``{}``, which makes ``_validate_request`` skip the
	                language-pair check.
	        """
	        self.provider_name = provider_name
	        self.supported_languages = supported_languages or {}
	        self.max_chunk_length = 1000  # Default chunk size (characters) for text processing

	    def translate(self, request: 'TranslationRequest') -> 'TextContent':
	        """
	        Translate text from source language to target language.

	        The request is validated, the source text is chunked with
	        ``_chunk_text``, every chunk is passed to ``_translate_chunk`` and the
	        results are joined with ``_reassemble_chunks``.

	        Args:
	            request: The translation request

	        Returns:
	            TextContent: The translated text, tagged with the target language
	            and ``utf-8`` encoding

	        Raises:
	            TranslationFailedException: If validation or translation fails.
	                A ``TranslationFailedException`` raised while processing is
	                propagated unchanged; any other exception is wrapped and
	                chained as the cause.
	        """
	        try:
	            logger.info("Starting translation with %s provider", self.provider_name)
	            logger.info(
	                "Translating from %s to %s",
	                request.source_text.language,
	                request.target_language,
	            )

	            self._validate_request(request)

	            source_text = request.source_text.text
	            source_language = request.source_text.language
	            target_language = request.target_language

	            # Split text into chunks for processing
	            text_chunks = self._chunk_text(source_text)
	            total_chunks = len(text_chunks)
	            logger.info("Split text into %d chunks for processing", total_chunks)

	            # Translate each chunk
	            translated_chunks = []
	            for position, chunk in enumerate(text_chunks, start=1):
	                logger.debug("Translating chunk %d/%d", position, total_chunks)
	                translated_chunks.append(
	                    self._translate_chunk(chunk, source_language, target_language)
	                )

	            # Reassemble translated text
	            translated_text = self._reassemble_chunks(translated_chunks)

	            # Imported here because the module-level import of TextContent is
	            # guarded by TYPE_CHECKING and therefore unavailable at runtime.
	            from ...domain.models.text_content import TextContent

	            result = TextContent(
	                text=translated_text,
	                language=target_language,
	                encoding='utf-8',
	            )

	            logger.info("Translation completed successfully with %s", self.provider_name)
	            logger.info(
	                "Original length: %d, Translated length: %d",
	                len(source_text),
	                len(translated_text),
	            )

	            return result

	        except TranslationFailedException as e:
	            # Already a domain exception (validation failure or a provider
	            # error converted by _handle_provider_error): log it and let it
	            # propagate instead of wrapping it in a second
	            # TranslationFailedException.
	            logger.error("Translation failed with %s: %s", self.provider_name, e)
	            raise
	        except Exception as e:
	            logger.error("Translation failed with %s: %s", self.provider_name, e)
	            raise TranslationFailedException(f"Translation failed: {e}") from e

	    @abstractmethod
	    def _translate_chunk(self, text: str, source_language: str, target_language: str) -> str:
	        """
	        Translate a single chunk of text using provider-specific implementation.

	        Args:
	            text: The text chunk to translate
	            source_language: Source language code
	            target_language: Target language code

	        Returns:
	            str: The translated text chunk
	        """
	        pass

	    @abstractmethod
	    def is_available(self) -> bool:
	        """
	        Check if the translation provider is available and ready to use.

	        Returns:
	            bool: True if provider is available, False otherwise
	        """
	        pass

	    @abstractmethod
	    def get_supported_languages(self) -> dict[str, list[str]]:
	        """
	        Get supported language pairs for this provider.

	        Returns:
	            dict: Mapping of source languages to supported target languages
	        """
	        pass

	    def _chunk_text(self, text: str) -> List[str]:
	        """
	        Split text into chunks for translation processing.

	        Text that already fits in ``max_chunk_length`` is returned unchanged
	        as a single chunk. Longer text is split into sentences, which are
	        packed greedily (joined by single spaces) into chunks of at most
	        ``max_chunk_length`` characters. A sentence that is itself longer than
	        the limit is split on whitespace by ``_split_long_sentence``.

	        Args:
	            text: The text to chunk

	        Returns:
	            List[str]: List of text chunks
	        """
	        limit = self.max_chunk_length
	        if len(text) <= limit:
	            return [text]

	        chunks = []
	        current_chunk = ""

	        # Split by sentences first to maintain context
	        for sentence in self._split_into_sentences(text):
	            # Count the joining space as well; ignoring it lets a chunk grow
	            # one character past the limit.
	            joined_length = len(current_chunk) + len(sentence) + (1 if current_chunk else 0)
	            if joined_length <= limit:
	                current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence
	                continue

	            # The sentence does not fit: close the chunk in progress.
	            if current_chunk:
	                chunks.append(current_chunk)
	                current_chunk = ""

	            if len(sentence) > limit:
	                # A single sentence is too long, so split it by words. The
	                # last piece stays open so following sentences can join it.
	                word_chunks = self._split_long_sentence(sentence)
	                chunks.extend(word_chunks[:-1])
	                current_chunk = word_chunks[-1]
	            else:
	                current_chunk = sentence

	        # Add remaining chunk
	        if current_chunk:
	            chunks.append(current_chunk)

	        logger.debug("Text chunked into %d pieces", len(chunks))
	        return chunks

	    def _split_into_sentences(self, text: str) -> List[str]:
	        """
	        Split text into sentences using basic punctuation rules.

	        A sentence boundary is any whitespace run that directly follows
	        ``.``, ``!`` or ``?``. The terminating punctuation stays attached to
	        its sentence so that it survives chunking and reaches the translator.

	        Args:
	            text: The text to split

	        Returns:
	            List[str]: List of non-empty, stripped sentences
	        """
	        # Simple sentence splitting using regex; more sophisticated NLP
	        # libraries could be used. The lookbehind makes the split consume only
	        # the whitespace, not the punctuation before it.
	        sentence_boundary = r'(?<=[.!?])\s+'
	        pieces = re.split(sentence_boundary, text)

	        # Filter out empty sentences and strip whitespace
	        return [piece.strip() for piece in pieces if piece.strip()]

	    def _split_long_sentence(self, sentence: str) -> List[str]:
	        """
	        Split a long sentence into smaller chunks by words.

	        Words are packed greedily, joined by single spaces, into chunks of at
	        most ``max_chunk_length`` characters. A single word longer than the
	        limit is kept whole as its own chunk, so such a chunk can exceed the
	        limit.

	        Args:
	            sentence: The sentence to split

	        Returns:
	            List[str]: List of word chunks
	        """
	        limit = self.max_chunk_length
	        chunks = []
	        current_chunk = ""

	        for word in sentence.split():
	            # The separator only costs a character when there is already
	            # something in the chunk to separate from.
	            joined_length = len(current_chunk) + len(word) + (1 if current_chunk else 0)
	            if joined_length <= limit:
	                current_chunk = f"{current_chunk} {word}" if current_chunk else word
	                continue

	            if current_chunk:
	                chunks.append(current_chunk)
	            # Start a new chunk with this word, even if the word alone is
	            # over the limit: it is flushed as-is when the next word arrives.
	            current_chunk = word

	        if current_chunk:
	            chunks.append(current_chunk)

	        return chunks

	    def _reassemble_chunks(self, chunks: List[str]) -> str:
	        """
	        Reassemble translated chunks into a single text.

	        Args:
	            chunks: List of translated text chunks

	        Returns:
	            str: The stripped, non-blank chunks joined by single spaces
	        """
	        # Simple reassembly with space separation
	        # More sophisticated approaches could preserve original formatting
	        return " ".join(chunk.strip() for chunk in chunks if chunk.strip())

	    def _validate_request(self, request: 'TranslationRequest') -> None:
	        """
	        Validate the translation request.

	        Rejects blank source text, identical source and target languages and,
	        when ``supported_languages`` is non-empty, language pairs that are
	        not listed in it.

	        Args:
	            request: The translation request to validate

	        Raises:
	            TranslationFailedException: If request is invalid
	        """
	        if not request.source_text.text.strip():
	            raise TranslationFailedException("Source text cannot be empty")

	        source_lang = request.source_text.language
	        target_lang = request.target_language

	        if source_lang == target_lang:
	            raise TranslationFailedException("Source and target languages cannot be the same")

	        # An empty mapping means "no restriction", so there is nothing to check.
	        if not self.supported_languages:
	            return

	        if source_lang not in self.supported_languages:
	            raise TranslationFailedException(
	                f"Source language {source_lang} not supported by {self.provider_name}. "
	                f"Supported source languages: {list(self.supported_languages.keys())}"
	            )

	        if target_lang not in self.supported_languages[source_lang]:
	            raise TranslationFailedException(
	                f"Translation from {source_lang} to {target_lang} not supported by {self.provider_name}. "
	                f"Supported target languages for {source_lang}: {self.supported_languages[source_lang]}"
	            )

	    def _preprocess_text(self, text: str) -> str:
	        """
	        Preprocess text before translation.

	        Collapses every run of whitespace (including newlines) into a single
	        space and strips leading/trailing whitespace. ``translate`` does not
	        call this itself.

	        Args:
	            text: The text to preprocess

	        Returns:
	            str: Preprocessed text
	        """
	        # Remove excessive whitespace
	        text = re.sub(r'\s+', ' ', text)

	        # Strip leading/trailing whitespace
	        return text.strip()

	    def _postprocess_text(self, text: str) -> str:
	        """
	        Postprocess text after translation.

	        Collapses whitespace, strips the ends, removes whitespace before
	        ``. ! ? , : ;`` and puts exactly one space between ``. ! ?`` and a
	        following ASCII capital letter. ``translate`` does not call this
	        itself.

	        NOTE: the last rule also fires inside dotted initialisms, so
	        ``U.S.A`` becomes ``U. S. A``.

	        Args:
	            text: The text to postprocess

	        Returns:
	            str: Postprocessed text
	        """
	        # Remove excessive whitespace
	        text = re.sub(r'\s+', ' ', text)

	        # Strip leading/trailing whitespace
	        text = text.strip()

	        # Fix common spacing issues around punctuation
	        text = re.sub(r'\s+([.!?,:;])', r'\1', text)
	        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)

	        return text

	    def _handle_provider_error(self, error: Exception, context: str = "") -> None:
	        """
	        Handle provider-specific errors and convert to domain exceptions.

	        Logs the error with its traceback and then always raises; it never
	        returns normally.

	        Args:
	            error: The original error
	            context: Additional context about when the error occurred

	        Raises:
	            TranslationFailedException: Always, chained to ``error``
	        """
	        error_msg = f"{self.provider_name} error"
	        if context:
	            error_msg += f" during {context}"
	        error_msg += f": {error}"

	        logger.error(error_msg, exc_info=True)
	        raise TranslationFailedException(error_msg) from error

	    def set_chunk_size(self, chunk_size: int) -> None:
	        """
	        Set the maximum chunk size for text processing.

	        Args:
	            chunk_size: Maximum characters per chunk

	        Raises:
	            ValueError: If ``chunk_size`` is zero or negative
	        """
	        if chunk_size <= 0:
	            raise ValueError("Chunk size must be positive")

	        self.max_chunk_length = chunk_size
	        logger.info("Chunk size set to %s characters", chunk_size)

	    def get_translation_stats(self, request: 'TranslationRequest') -> dict:
	        """
	        Get statistics about a translation request.

	        Only chunks the source text; nothing is translated.

	        Args:
	            request: The translation request

	        Returns:
	            dict: Provider name, language pair, text length, whitespace-split
	            word count, chunk count and the maximum and average chunk length
	            (both 0 when there are no chunks)
	        """
	        text = request.source_text.text
	        chunk_lengths = [len(chunk) for chunk in self._chunk_text(text)]

	        return {
	            'provider': self.provider_name,
	            'source_language': request.source_text.language,
	            'target_language': request.target_language,
	            'text_length': len(text),
	            'word_count': len(text.split()),
	            'chunk_count': len(chunk_lengths),
	            'max_chunk_length': max(chunk_lengths, default=0),
	            'avg_chunk_length': sum(chunk_lengths) / len(chunk_lengths) if chunk_lengths else 0,
	        }