Mirror of https://github.com/langgenius/dify.git (synced 2026-04-05 09:49:25 +08:00)
refactor: reuse redis connection instead of creating a new one (#32678)
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
@@ -1,9 +1,10 @@
 import logging
 import time
-from collections.abc import Callable, Sequence
+from collections.abc import Sequence
+from typing import Any, Protocol

 import click
-from celery import shared_task
+from celery import current_app, shared_task

 from configs import dify_config
 from core.db.session_factory import session_factory
@@ -19,6 +20,12 @@ from tasks.generate_summary_index_task import generate_summary_index_task
 logger = logging.getLogger(__name__)


+class CeleryTaskLike(Protocol):
+    def delay(self, *args: Any, **kwargs: Any) -> Any: ...
+
+    def apply_async(self, *args: Any, **kwargs: Any) -> Any: ...
+
+
 @shared_task(queue="dataset")
 def document_indexing_task(dataset_id: str, document_ids: list):
     """
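Reviewer note: the Protocol above exists because the old Callable annotation (removed in the next hunk) did not describe how the task is actually used; a Celery task object is enqueued via .delay()/.apply_async(), not called directly. A minimal sketch of the structural typing, with a hypothetical FakeTask standing in for a real @shared_task function:

# Sketch: structural typing for Celery task objects (FakeTask is hypothetical).
from typing import Any, Protocol


class CeleryTaskLike(Protocol):
    def delay(self, *args: Any, **kwargs: Any) -> Any: ...

    def apply_async(self, *args: Any, **kwargs: Any) -> Any: ...


class FakeTask:
    # Stands in for a @shared_task-decorated function; only the shape matters.
    def delay(self, *args: Any, **kwargs: Any) -> Any:
        return ("delay", args, kwargs)

    def apply_async(self, *args: Any, **kwargs: Any) -> Any:
        return ("apply_async", args, kwargs)


def enqueue(task_func: CeleryTaskLike) -> None:
    # FakeTask satisfies CeleryTaskLike by structure alone; no inheritance needed.
    task_func.delay(dataset_id="d1", document_ids=["doc-1"])


enqueue(FakeTask())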
@@ -179,8 +186,8 @@ def _document_indexing(dataset_id: str, document_ids: Sequence[str]):


 def _document_indexing_with_tenant_queue(
-    tenant_id: str, dataset_id: str, document_ids: Sequence[str], task_func: Callable[[str, str, Sequence[str]], None]
-):
+    tenant_id: str, dataset_id: str, document_ids: Sequence[str], task_func: CeleryTaskLike
+) -> None:
     try:
         _document_indexing(dataset_id, document_ids)
     except Exception:
@@ -201,16 +208,20 @@ def _document_indexing_with_tenant_queue(
     logger.info("document indexing tenant isolation queue %s next tasks: %s", tenant_id, next_tasks)

     if next_tasks:
-        for next_task in next_tasks:
-            document_task = DocumentTask(**next_task)
-            # Process the next waiting task
-            # Keep the flag set to indicate a task is running
-            tenant_isolated_task_queue.set_task_waiting_time()
-            task_func.delay(  # type: ignore
-                tenant_id=document_task.tenant_id,
-                dataset_id=document_task.dataset_id,
-                document_ids=document_task.document_ids,
-            )
+        with current_app.producer_or_acquire() as producer:  # type: ignore
+            for next_task in next_tasks:
+                document_task = DocumentTask(**next_task)
+                # Keep the flag set to indicate a task is running
+                tenant_isolated_task_queue.set_task_waiting_time()
+                task_func.apply_async(
+                    kwargs={
+                        "tenant_id": document_task.tenant_id,
+                        "dataset_id": document_task.dataset_id,
+                        "document_ids": document_task.document_ids,
+                    },
+                    producer=producer,
+                )
+
     else:
         # No more waiting tasks, clear the flag
         tenant_isolated_task_queue.delete_task_key()
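This hunk is the heart of the commit: each .delay() call publishes through a producer acquired from the pool per call, while acquiring one producer up front with current_app.producer_or_acquire() and passing it to every apply_async sends all follow-up tasks over a single broker (Redis) connection. A runnable-shape sketch of the pattern; the app name, broker URL, and index_documents task are assumptions for illustration:

# Sketch of the connection-reuse pattern (app name, broker URL, and task are
# assumed; the broker is only contacted when apply_async actually publishes).
from celery import Celery

app = Celery("example", broker="redis://localhost:6379/0")


@app.task(queue="dataset")
def index_documents(tenant_id: str, dataset_id: str, document_ids: list) -> None:
    ...


def enqueue_batch(next_tasks: list[dict]) -> None:
    # Acquire one producer from the pool and reuse it for every publish,
    # rather than letting each .delay() acquire its own.
    with app.producer_or_acquire() as producer:
        for kwargs in next_tasks:
            index_documents.apply_async(kwargs=kwargs, producer=producer)

The remaining hunks apply the same idea to the RAG pipeline task in a second file, batching the publishes with a chunked helper and Celery's group.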
@@ -3,12 +3,13 @@ import json
 import logging
 import time
 import uuid
-from collections.abc import Mapping
+from collections.abc import Mapping, Sequence
 from concurrent.futures import ThreadPoolExecutor
+from itertools import islice
 from typing import Any

 import click
-from celery import shared_task  # type: ignore
+from celery import group, shared_task
 from flask import current_app, g
 from sqlalchemy.orm import Session, sessionmaker

@@ -27,6 +28,11 @@ from services.file_service import FileService
 logger = logging.getLogger(__name__)


+def chunked(iterable: Sequence, size: int):
+    it = iter(iterable)
+    return iter(lambda: list(islice(it, size)), [])
+
+
 @shared_task(queue="pipeline")
 def rag_pipeline_run_task(
     rag_pipeline_invoke_entities_file_id: str,
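The chunked helper relies on the two-argument form of iter(): iter(fn, sentinel) calls fn() repeatedly until it returns the sentinel. Here fn slices off the next size items, and the empty list returned at exhaustion terminates the iteration. A standalone usage sketch of the same helper:

# Same helper as in the diff, shown with a usage example.
from collections.abc import Sequence
from itertools import islice


def chunked(iterable: Sequence, size: int):
    it = iter(iterable)
    # iter(fn, sentinel): keep calling fn() until it returns the sentinel [].
    return iter(lambda: list(islice(it, size)), [])


print(list(chunked(list(range(7)), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]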
@@ -83,16 +89,24 @@ def rag_pipeline_run_task(
     logger.info("rag pipeline tenant isolation queue %s next files: %s", tenant_id, next_file_ids)

     if next_file_ids:
-        for next_file_id in next_file_ids:
-            # Process the next waiting task
-            # Keep the flag set to indicate a task is running
-            tenant_isolated_task_queue.set_task_waiting_time()
-            rag_pipeline_run_task.delay(  # type: ignore
-                rag_pipeline_invoke_entities_file_id=next_file_id.decode("utf-8")
-                if isinstance(next_file_id, bytes)
-                else next_file_id,
-                tenant_id=tenant_id,
-            )
+        for batch in chunked(next_file_ids, 100):
+            jobs = []
+            for next_file_id in batch:
+                tenant_isolated_task_queue.set_task_waiting_time()
+
+                file_id = (
+                    next_file_id.decode("utf-8") if isinstance(next_file_id, (bytes, bytearray)) else next_file_id
+                )
+
+                jobs.append(
+                    rag_pipeline_run_task.s(
+                        rag_pipeline_invoke_entities_file_id=file_id,
+                        tenant_id=tenant_id,
+                    )
+                )
+
+            if jobs:
+                group(jobs).apply_async()
     else:
         # No more waiting tasks, clear the flag
         tenant_isolated_task_queue.delete_task_key()
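Here the per-file .delay() loop becomes batches of up to 100 Celery signatures published as one group, which complements the producer reuse above: each batch goes out as a burst rather than as isolated publishes. The (bytes, bytearray) check handles IDs read back from Redis, which arrive as bytes unless the client decodes responses. A sketch of the batching pattern, again under an assumed app and broker and with a hypothetical run_pipeline task:

# Sketch of the group-batching pattern (app, broker URL, and run_pipeline are
# assumptions; group(...).apply_async() publishes every signature in the batch).
from celery import Celery, group

app = Celery("example", broker="redis://localhost:6379/0")


@app.task(queue="pipeline")
def run_pipeline(file_id: str, tenant_id: str) -> None:
    ...


def enqueue_files(file_ids: list[str], tenant_id: str) -> None:
    # Plain slicing stands in for the diff's chunked() helper.
    for start in range(0, len(file_ids), 100):
        batch = file_ids[start : start + 100]
        jobs = [run_pipeline.s(file_id=fid, tenant_id=tenant_id) for fid in batch]
        if jobs:
            group(jobs).apply_async()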