Spaces:

bibibi12345
/

vertex

Running

App Files Community

bibibi12345 committed 4 days ago

Commit

0a33ddd

1 Parent(s): 04c79ee

bug fixes

Browse files

Files changed (2) hide show

app/api_helpers.py +84 -28
app/openai_handler.py +49 -21

app/api_helpers.py CHANGED Viewed

@@ -34,6 +34,7 @@ class StreamingReasoningProcessor:
         self.tag_buffer = ""
         self.inside_tag = False
         self.reasoning_buffer = ""
     def process_chunk(self, content: str) -> tuple[str, str]:
         """
@@ -45,9 +46,14 @@ class StreamingReasoningProcessor:
         Returns:
             A tuple of:
             - processed_content: Content with reasoning tags removed
-            - current_reasoning: Complete reasoning text if a closing tag was found
         """
-        # Add new content to buffer
         self.tag_buffer += content
         processed_content = ""
@@ -58,12 +64,27 @@ class StreamingReasoningProcessor:
                 # Look for opening tag
                 open_pos = self.tag_buffer.find(self.open_tag)
                 if open_pos == -1:
-                    # No opening tag found
-                    if len(self.tag_buffer) >= len(self.open_tag):
-                        # Safe to output all but the last few chars (in case tag is split)
-                        safe_length = len(self.tag_buffer) - len(self.open_tag) + 1
-                        processed_content += self.tag_buffer[:safe_length]
-                        self.tag_buffer = self.tag_buffer[safe_length:]
                     break
                 else:
                     # Found opening tag
@@ -74,18 +95,40 @@ class StreamingReasoningProcessor:
                 # Inside tag, look for closing tag
                 close_pos = self.tag_buffer.find(self.close_tag)
                 if close_pos == -1:
-                    # No closing tag yet
-                    if len(self.tag_buffer) >= len(self.close_tag):
-                        # Safe to add to reasoning buffer
-                        safe_length = len(self.tag_buffer) - len(self.close_tag) + 1
-                        self.reasoning_buffer += self.tag_buffer[:safe_length]
-                        self.tag_buffer = self.tag_buffer[safe_length:]
                     break
                 else:
                     # Found closing tag
-                    self.reasoning_buffer += self.tag_buffer[:close_pos]
-                    current_reasoning = self.reasoning_buffer
-                    self.reasoning_buffer = ""
                     self.tag_buffer = self.tag_buffer[close_pos + len(self.close_tag):]
                     self.inside_tag = False
@@ -103,17 +146,30 @@ class StreamingReasoningProcessor:
         remaining_content = ""
         remaining_reasoning = ""
-        if self.tag_buffer and not self.inside_tag:
-            # If we have buffered content and we're not inside a tag,
-            # it's safe to output all of it
-            remaining_content = self.tag_buffer
-            self.tag_buffer = ""
-        elif self.inside_tag:
-            # If we're inside a tag when the stream ends, we have an unclosed tag
-            # Return the partial content as regular content (including the opening tag)
-            remaining_content = f"<{self.tag_name}>{self.reasoning_buffer}{self.tag_buffer}"
-            self.reasoning_buffer = ""
-            self.tag_buffer = ""
             self.inside_tag = False
         return remaining_content, remaining_reasoning

         self.tag_buffer = ""
         self.inside_tag = False
         self.reasoning_buffer = ""
+        self.partial_tag_buffer = ""  # Buffer for potential partial tags
     def process_chunk(self, content: str) -> tuple[str, str]:
         """
         Returns:
             A tuple of:
             - processed_content: Content with reasoning tags removed
+            - current_reasoning: Reasoning text found in this chunk (partial or complete)
         """
+        # Add new content to buffer, but also handle any partial tag from before
+        if self.partial_tag_buffer:
+            # We had a partial tag from the previous chunk
+            content = self.partial_tag_buffer + content
+            self.partial_tag_buffer = ""
         self.tag_buffer += content
         processed_content = ""
                 # Look for opening tag
                 open_pos = self.tag_buffer.find(self.open_tag)
                 if open_pos == -1:
+                    # No complete opening tag found
+                    # Check if we might have a partial tag at the end
+                    partial_match = False
+                    for i in range(1, min(len(self.open_tag), len(self.tag_buffer) + 1)):
+                        if self.tag_buffer[-i:] == self.open_tag[:i]:
+                            partial_match = True
+                            # Output everything except the potential partial tag
+                            if len(self.tag_buffer) > i:
+                                processed_content += self.tag_buffer[:-i]
+                                self.partial_tag_buffer = self.tag_buffer[-i:]
+                                self.tag_buffer = ""
+                            else:
+                                # Entire buffer is partial tag
+                                self.partial_tag_buffer = self.tag_buffer
+                                self.tag_buffer = ""
+                            break
+                    if not partial_match:
+                        # No partial tag, output everything
+                        processed_content += self.tag_buffer
+                        self.tag_buffer = ""
                     break
                 else:
                     # Found opening tag
                 # Inside tag, look for closing tag
                 close_pos = self.tag_buffer.find(self.close_tag)
                 if close_pos == -1:
+                    # No complete closing tag yet
+                    # Check for partial closing tag
+                    partial_match = False
+                    for i in range(1, min(len(self.close_tag), len(self.tag_buffer) + 1)):
+                        if self.tag_buffer[-i:] == self.close_tag[:i]:
+                            partial_match = True
+                            # Add everything except potential partial tag to reasoning
+                            if len(self.tag_buffer) > i:
+                                new_reasoning = self.tag_buffer[:-i]
+                                self.reasoning_buffer += new_reasoning
+                                if new_reasoning:  # Stream reasoning as it arrives
+                                    current_reasoning = new_reasoning
+                                self.partial_tag_buffer = self.tag_buffer[-i:]
+                                self.tag_buffer = ""
+                            else:
+                                # Entire buffer is partial tag
+                                self.partial_tag_buffer = self.tag_buffer
+                                self.tag_buffer = ""
+                            break
+                    if not partial_match:
+                        # No partial tag, add all to reasoning and stream it
+                        if self.tag_buffer:
+                            self.reasoning_buffer += self.tag_buffer
+                            current_reasoning = self.tag_buffer
+                            self.tag_buffer = ""
                     break
                 else:
                     # Found closing tag
+                    final_reasoning_chunk = self.tag_buffer[:close_pos]
+                    self.reasoning_buffer += final_reasoning_chunk
+                    if final_reasoning_chunk:  # Include the last chunk of reasoning
+                        current_reasoning = final_reasoning_chunk
+                    self.reasoning_buffer = ""  # Clear buffer after complete tag
                     self.tag_buffer = self.tag_buffer[close_pos + len(self.close_tag):]
                     self.inside_tag = False
         remaining_content = ""
         remaining_reasoning = ""
+        # First handle any partial tag buffer
+        if self.partial_tag_buffer:
+            # The partial tag wasn't completed, so treat it as regular content
+            remaining_content += self.partial_tag_buffer
+            self.partial_tag_buffer = ""
+        if not self.inside_tag:
+            # If we're not inside a tag, output any remaining buffer
+            if self.tag_buffer:
+                remaining_content += self.tag_buffer
+                self.tag_buffer = ""
+        else:
+            # If we're inside a tag when stream ends, we have incomplete reasoning
+            # First, yield any reasoning we've accumulated
+            if self.reasoning_buffer:
+                remaining_reasoning = self.reasoning_buffer
+                self.reasoning_buffer = ""
+            # Then output the remaining buffer as content (it's an incomplete tag)
+            if self.tag_buffer:
+                # Don't include the opening tag in output - just the buffer content
+                remaining_content += self.tag_buffer
+                self.tag_buffer = ""
             self.inside_tag = False
         return remaining_content, remaining_reasoning

app/openai_handler.py CHANGED Viewed

@@ -121,6 +121,7 @@ class OpenAIDirectHandler:
             # Create processor for tag-based extraction across chunks
             reasoning_processor = StreamingReasoningProcessor(VERTEX_REASONING_TAG)
             chunk_count = 0
             async for chunk in stream_response:
                 chunk_count += 1
@@ -145,20 +146,36 @@ class OpenAIDirectHandler:
                                 if processed_content or current_reasoning:
                                     print(f"DEBUG: Chunk {chunk_count} - Processed content: '{processed_content}', Reasoning: '{current_reasoning[:50]}...' if len(current_reasoning) > 50 else '{current_reasoning}'")
-                                # Update delta with processed content
                                 if current_reasoning:
-                                    delta['reasoning_content'] = current_reasoning
                                 if processed_content:
-                                    delta['content'] = processed_content
-                                elif 'content' in delta:
-                                    del delta['content']
-                    yield f"data: {json.dumps(chunk_as_dict)}\n\n"
                 except Exception as chunk_error:
                     error_msg = f"Error processing OpenAI chunk for {request.model}: {str(chunk_error)}"
                     print(f"ERROR: {error_msg}")
-                    if len(error_msg) > 1024:
                         error_msg = error_msg[:1024] + "..."
                     error_response = create_openai_error_response(500, error_msg, "server_error")
                     yield f"data: {json.dumps(error_response)}\n\n"
@@ -173,35 +190,46 @@ class OpenAIDirectHandler:
             # Flush any remaining buffered content
             remaining_content, remaining_reasoning = reasoning_processor.flush_remaining()
-            if remaining_content:
-                print(f"DEBUG: Flushing remaining content: '{remaining_content}'")
-                final_chunk = {
                     "id": f"chatcmpl-{int(time.time())}",
                     "object": "chat.completion.chunk",
                     "created": int(time.time()),
                     "model": request.model,
-                    "choices": [{"index": 0, "delta": {"content": remaining_content}, "finish_reason": None}]
                 }
-                yield f"data: {json.dumps(final_chunk)}\n\n"
-                # Send a proper finish reason chunk
-                finish_chunk = {
                     "id": f"chatcmpl-{int(time.time())}",
                     "object": "chat.completion.chunk",
                     "created": int(time.time()),
                     "model": request.model,
-                    "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
                 }
-                yield f"data: {json.dumps(finish_chunk)}\n\n"
-            # Note: remaining_reasoning is not used here since incomplete reasoning
-            # is treated as regular content when tags are unclosed
             yield "data: [DONE]\n\n"
         except Exception as stream_error:
             error_msg = str(stream_error)
-            if len(error_msg) > 1024:
                 error_msg = error_msg[:1024] + "..."
             error_msg_full = f"Error during OpenAI streaming for {request.model}: {error_msg}"
             print(f"ERROR: {error_msg_full}")

             # Create processor for tag-based extraction across chunks
             reasoning_processor = StreamingReasoningProcessor(VERTEX_REASONING_TAG)
             chunk_count = 0
+            has_sent_content = False
             async for chunk in stream_response:
                 chunk_count += 1
                                 if processed_content or current_reasoning:
                                     print(f"DEBUG: Chunk {chunk_count} - Processed content: '{processed_content}', Reasoning: '{current_reasoning[:50]}...' if len(current_reasoning) > 50 else '{current_reasoning}'")
+                                # Send chunks for both reasoning and content as they arrive
+                                chunks_to_send = []
+                                # If we have reasoning content, send it
                                 if current_reasoning:
+                                    reasoning_chunk = chunk_as_dict.copy()
+                                    reasoning_chunk['choices'][0]['delta'] = {'reasoning_content': current_reasoning}
+                                    chunks_to_send.append(reasoning_chunk)
+                                # If we have regular content, send it
                                 if processed_content:
+                                    content_chunk = chunk_as_dict.copy()
+                                    content_chunk['choices'][0]['delta'] = {'content': processed_content}
+                                    chunks_to_send.append(content_chunk)
+                                    has_sent_content = True
+                                # Send all chunks
+                                for chunk_to_send in chunks_to_send:
+                                    yield f"data: {json.dumps(chunk_to_send)}\n\n"
+                            else:
+                                # Still yield the chunk even if no content (could have other delta fields)
+                                yield f"data: {json.dumps(chunk_as_dict)}\n\n"
+                    else:
+                        # Yield chunks without choices too (they might contain metadata)
+                        yield f"data: {json.dumps(chunk_as_dict)}\n\n"
                 except Exception as chunk_error:
                     error_msg = f"Error processing OpenAI chunk for {request.model}: {str(chunk_error)}"
                     print(f"ERROR: {error_msg}")
+                    if len(error_msg) > 1024:
                         error_msg = error_msg[:1024] + "..."
                     error_response = create_openai_error_response(500, error_msg, "server_error")
                     yield f"data: {json.dumps(error_response)}\n\n"
             # Flush any remaining buffered content
             remaining_content, remaining_reasoning = reasoning_processor.flush_remaining()
+            # Send any remaining reasoning first
+            if remaining_reasoning:
+                print(f"DEBUG: Flushing remaining reasoning: '{remaining_reasoning[:50]}...' if len(remaining_reasoning) > 50 else '{remaining_reasoning}'")
+                reasoning_chunk = {
                     "id": f"chatcmpl-{int(time.time())}",
                     "object": "chat.completion.chunk",
                     "created": int(time.time()),
                     "model": request.model,
+                    "choices": [{"index": 0, "delta": {"reasoning_content": remaining_reasoning}, "finish_reason": None}]
                 }
+                yield f"data: {json.dumps(reasoning_chunk)}\n\n"
+            # Send any remaining content
+            if remaining_content:
+                print(f"DEBUG: Flushing remaining content: '{remaining_content}'")
+                final_chunk = {
                     "id": f"chatcmpl-{int(time.time())}",
                     "object": "chat.completion.chunk",
                     "created": int(time.time()),
                     "model": request.model,
+                    "choices": [{"index": 0, "delta": {"content": remaining_content}, "finish_reason": None}]
                 }
+                yield f"data: {json.dumps(final_chunk)}\n\n"
+                has_sent_content = True
+            # Always send a finish reason chunk
+            finish_chunk = {
+                "id": f"chatcmpl-{int(time.time())}",
+                "object": "chat.completion.chunk",
+                "created": int(time.time()),
+                "model": request.model,
+                "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}]
+            }
+            yield f"data: {json.dumps(finish_chunk)}\n\n"
             yield "data: [DONE]\n\n"
         except Exception as stream_error:
             error_msg = str(stream_error)
+            if len(error_msg) > 1024:
                 error_msg = error_msg[:1024] + "..."
             error_msg_full = f"Error during OpenAI streaming for {request.model}: {error_msg}"
             print(f"ERROR: {error_msg_full}")