mirror of
https://github.com/langgenius/dify.git
synced 2026-04-05 19:52:03 +08:00
perf: use batch delete method instead of single delete (#32036)
Co-authored-by: fatelei <fatelei@gmail.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: FFXN <lizy@dify.ai>
This commit is contained in:
@@ -14,6 +14,9 @@ from models.model import UploadFile
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Batch size for database operations to keep transactions short
|
||||||
|
BATCH_SIZE = 1000
|
||||||
|
|
||||||
|
|
||||||
@shared_task(queue="dataset")
|
@shared_task(queue="dataset")
|
||||||
def batch_clean_document_task(document_ids: list[str], dataset_id: str, doc_form: str | None, file_ids: list[str]):
|
def batch_clean_document_task(document_ids: list[str], dataset_id: str, doc_form: str | None, file_ids: list[str]):
|
||||||
@@ -31,63 +34,179 @@ def batch_clean_document_task(document_ids: list[str], dataset_id: str, doc_form
|
|||||||
if not doc_form:
|
if not doc_form:
|
||||||
raise ValueError("doc_form is required")
|
raise ValueError("doc_form is required")
|
||||||
|
|
||||||
with session_factory.create_session() as session:
|
storage_keys_to_delete: list[str] = []
|
||||||
try:
|
index_node_ids: list[str] = []
|
||||||
dataset = session.query(Dataset).where(Dataset.id == dataset_id).first()
|
segment_ids: list[str] = []
|
||||||
|
total_image_upload_file_ids: list[str] = []
|
||||||
if not dataset:
|
|
||||||
raise Exception("Document has no dataset")
|
|
||||||
|
|
||||||
session.query(DatasetMetadataBinding).where(
|
|
||||||
DatasetMetadataBinding.dataset_id == dataset_id,
|
|
||||||
DatasetMetadataBinding.document_id.in_(document_ids),
|
|
||||||
).delete(synchronize_session=False)
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
# ============ Step 1: Query segment and file data (short read-only transaction) ============
|
||||||
|
with session_factory.create_session() as session:
|
||||||
|
# Get segments info
|
||||||
segments = session.scalars(
|
segments = session.scalars(
|
||||||
select(DocumentSegment).where(DocumentSegment.document_id.in_(document_ids))
|
select(DocumentSegment).where(DocumentSegment.document_id.in_(document_ids))
|
||||||
).all()
|
).all()
|
||||||
# check segment is exist
|
|
||||||
if segments:
|
if segments:
|
||||||
index_node_ids = [segment.index_node_id for segment in segments]
|
index_node_ids = [segment.index_node_id for segment in segments]
|
||||||
index_processor = IndexProcessorFactory(doc_form).init_index_processor()
|
segment_ids = [segment.id for segment in segments]
|
||||||
index_processor.clean(
|
|
||||||
dataset, index_node_ids, with_keywords=True, delete_child_chunks=True, delete_summaries=True
|
|
||||||
)
|
|
||||||
|
|
||||||
|
# Collect image file IDs from segment content
|
||||||
for segment in segments:
|
for segment in segments:
|
||||||
image_upload_file_ids = get_image_upload_file_ids(segment.content)
|
image_upload_file_ids = get_image_upload_file_ids(segment.content)
|
||||||
image_files = session.query(UploadFile).where(UploadFile.id.in_(image_upload_file_ids)).all()
|
total_image_upload_file_ids.extend(image_upload_file_ids)
|
||||||
for image_file in image_files:
|
|
||||||
try:
|
# Query storage keys for image files
|
||||||
if image_file and image_file.key:
|
if total_image_upload_file_ids:
|
||||||
storage.delete(image_file.key)
|
image_files = session.scalars(
|
||||||
except Exception:
|
select(UploadFile).where(UploadFile.id.in_(total_image_upload_file_ids))
|
||||||
logger.exception(
|
).all()
|
||||||
"Delete image_files failed when storage deleted, \
|
storage_keys_to_delete.extend([f.key for f in image_files if f and f.key])
|
||||||
image_upload_file_is: %s",
|
|
||||||
image_file.id,
|
# Query storage keys for document files
|
||||||
)
|
|
||||||
stmt = delete(UploadFile).where(UploadFile.id.in_(image_upload_file_ids))
|
|
||||||
session.execute(stmt)
|
|
||||||
session.delete(segment)
|
|
||||||
if file_ids:
|
if file_ids:
|
||||||
files = session.scalars(select(UploadFile).where(UploadFile.id.in_(file_ids))).all()
|
files = session.scalars(select(UploadFile).where(UploadFile.id.in_(file_ids))).all()
|
||||||
for file in files:
|
storage_keys_to_delete.extend([f.key for f in files if f and f.key])
|
||||||
try:
|
|
||||||
storage.delete(file.key)
|
|
||||||
except Exception:
|
|
||||||
logger.exception("Delete file failed when document deleted, file_id: %s", file.id)
|
|
||||||
stmt = delete(UploadFile).where(UploadFile.id.in_(file_ids))
|
|
||||||
session.execute(stmt)
|
|
||||||
|
|
||||||
session.commit()
|
# ============ Step 2: Clean vector index (external service, fresh session for dataset) ============
|
||||||
|
if index_node_ids:
|
||||||
end_at = time.perf_counter()
|
try:
|
||||||
logger.info(
|
# Fetch dataset in a fresh session to avoid DetachedInstanceError
|
||||||
click.style(
|
with session_factory.create_session() as session:
|
||||||
f"Cleaned documents when documents deleted latency: {end_at - start_at}",
|
dataset = session.query(Dataset).where(Dataset.id == dataset_id).first()
|
||||||
fg="green",
|
if not dataset:
|
||||||
|
logger.warning("Dataset not found for vector index cleanup, dataset_id: %s", dataset_id)
|
||||||
|
else:
|
||||||
|
index_processor = IndexProcessorFactory(doc_form).init_index_processor()
|
||||||
|
index_processor.clean(
|
||||||
|
dataset, index_node_ids, with_keywords=True, delete_child_chunks=True, delete_summaries=True
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
logger.exception(
|
||||||
|
"Failed to clean vector index for dataset_id: %s, document_ids: %s, index_node_ids count: %d",
|
||||||
|
dataset_id,
|
||||||
|
document_ids,
|
||||||
|
len(index_node_ids),
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
# ============ Step 3: Delete metadata binding (separate short transaction) ============
|
||||||
|
try:
|
||||||
|
with session_factory.create_session() as session:
|
||||||
|
deleted_count = (
|
||||||
|
session.query(DatasetMetadataBinding)
|
||||||
|
.where(
|
||||||
|
DatasetMetadataBinding.dataset_id == dataset_id,
|
||||||
|
DatasetMetadataBinding.document_id.in_(document_ids),
|
||||||
|
)
|
||||||
|
.delete(synchronize_session=False)
|
||||||
|
)
|
||||||
|
session.commit()
|
||||||
|
logger.debug("Deleted %d metadata bindings for dataset_id: %s", deleted_count, dataset_id)
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("Cleaned documents when documents deleted failed")
|
logger.exception(
|
||||||
|
"Failed to delete metadata bindings for dataset_id: %s, document_ids: %s",
|
||||||
|
dataset_id,
|
||||||
|
document_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============ Step 4: Batch delete UploadFile records (multiple short transactions) ============
|
||||||
|
if total_image_upload_file_ids:
|
||||||
|
failed_batches = 0
|
||||||
|
total_batches = (len(total_image_upload_file_ids) + BATCH_SIZE - 1) // BATCH_SIZE
|
||||||
|
for i in range(0, len(total_image_upload_file_ids), BATCH_SIZE):
|
||||||
|
batch = total_image_upload_file_ids[i : i + BATCH_SIZE]
|
||||||
|
try:
|
||||||
|
with session_factory.create_session() as session:
|
||||||
|
stmt = delete(UploadFile).where(UploadFile.id.in_(batch))
|
||||||
|
session.execute(stmt)
|
||||||
|
session.commit()
|
||||||
|
except Exception:
|
||||||
|
failed_batches += 1
|
||||||
|
logger.exception(
|
||||||
|
"Failed to delete image UploadFile batch %d-%d for dataset_id: %s",
|
||||||
|
i,
|
||||||
|
i + len(batch),
|
||||||
|
dataset_id,
|
||||||
|
)
|
||||||
|
if failed_batches > 0:
|
||||||
|
logger.warning(
|
||||||
|
"Image UploadFile deletion: %d/%d batches failed for dataset_id: %s",
|
||||||
|
failed_batches,
|
||||||
|
total_batches,
|
||||||
|
dataset_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============ Step 5: Batch delete DocumentSegment records (multiple short transactions) ============
|
||||||
|
if segment_ids:
|
||||||
|
failed_batches = 0
|
||||||
|
total_batches = (len(segment_ids) + BATCH_SIZE - 1) // BATCH_SIZE
|
||||||
|
for i in range(0, len(segment_ids), BATCH_SIZE):
|
||||||
|
batch = segment_ids[i : i + BATCH_SIZE]
|
||||||
|
try:
|
||||||
|
with session_factory.create_session() as session:
|
||||||
|
segment_delete_stmt = delete(DocumentSegment).where(DocumentSegment.id.in_(batch))
|
||||||
|
session.execute(segment_delete_stmt)
|
||||||
|
session.commit()
|
||||||
|
except Exception:
|
||||||
|
failed_batches += 1
|
||||||
|
logger.exception(
|
||||||
|
"Failed to delete DocumentSegment batch %d-%d for dataset_id: %s, document_ids: %s",
|
||||||
|
i,
|
||||||
|
i + len(batch),
|
||||||
|
dataset_id,
|
||||||
|
document_ids,
|
||||||
|
)
|
||||||
|
if failed_batches > 0:
|
||||||
|
logger.warning(
|
||||||
|
"DocumentSegment deletion: %d/%d batches failed, document_ids: %s",
|
||||||
|
failed_batches,
|
||||||
|
total_batches,
|
||||||
|
document_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============ Step 6: Delete document-associated files (separate short transaction) ============
|
||||||
|
if file_ids:
|
||||||
|
try:
|
||||||
|
with session_factory.create_session() as session:
|
||||||
|
stmt = delete(UploadFile).where(UploadFile.id.in_(file_ids))
|
||||||
|
session.execute(stmt)
|
||||||
|
session.commit()
|
||||||
|
except Exception:
|
||||||
|
logger.exception(
|
||||||
|
"Failed to delete document UploadFile records for dataset_id: %s, file_ids: %s",
|
||||||
|
dataset_id,
|
||||||
|
file_ids,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ============ Step 7: Delete storage files (I/O operations, no DB transaction) ============
|
||||||
|
storage_delete_failures = 0
|
||||||
|
for storage_key in storage_keys_to_delete:
|
||||||
|
try:
|
||||||
|
storage.delete(storage_key)
|
||||||
|
except Exception:
|
||||||
|
storage_delete_failures += 1
|
||||||
|
logger.exception("Failed to delete file from storage, key: %s", storage_key)
|
||||||
|
if storage_delete_failures > 0:
|
||||||
|
logger.warning(
|
||||||
|
"Storage file deletion completed with %d failures out of %d total files for dataset_id: %s",
|
||||||
|
storage_delete_failures,
|
||||||
|
len(storage_keys_to_delete),
|
||||||
|
dataset_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
end_at = time.perf_counter()
|
||||||
|
logger.info(
|
||||||
|
click.style(
|
||||||
|
f"Cleaned documents when documents deleted latency: {end_at - start_at:.2f}s, "
|
||||||
|
f"dataset_id: {dataset_id}, document_ids: {document_ids}, "
|
||||||
|
f"segments: {len(segment_ids)}, image_files: {len(total_image_upload_file_ids)}, "
|
||||||
|
f"storage_files: {len(storage_keys_to_delete)}",
|
||||||
|
fg="green",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
logger.exception(
|
||||||
|
"Batch clean documents failed for dataset_id: %s, document_ids: %s",
|
||||||
|
dataset_id,
|
||||||
|
document_ids,
|
||||||
|
)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ import time
|
|||||||
|
|
||||||
import click
|
import click
|
||||||
from celery import shared_task
|
from celery import shared_task
|
||||||
|
from sqlalchemy import delete
|
||||||
|
|
||||||
from core.db.session_factory import session_factory
|
from core.db.session_factory import session_factory
|
||||||
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
|
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
|
||||||
@@ -67,8 +68,14 @@ def delete_segment_from_index_task(
|
|||||||
if segment_attachment_bindings:
|
if segment_attachment_bindings:
|
||||||
attachment_ids = [binding.attachment_id for binding in segment_attachment_bindings]
|
attachment_ids = [binding.attachment_id for binding in segment_attachment_bindings]
|
||||||
index_processor.clean(dataset=dataset, node_ids=attachment_ids, with_keywords=False)
|
index_processor.clean(dataset=dataset, node_ids=attachment_ids, with_keywords=False)
|
||||||
for binding in segment_attachment_bindings:
|
segment_attachment_bind_ids = [i.id for i in segment_attachment_bindings]
|
||||||
session.delete(binding)
|
|
||||||
|
for i in range(0, len(segment_attachment_bind_ids), 1000):
|
||||||
|
segment_attachment_bind_delete_stmt = delete(SegmentAttachmentBinding).where(
|
||||||
|
SegmentAttachmentBinding.id.in_(segment_attachment_bind_ids[i : i + 1000])
|
||||||
|
)
|
||||||
|
session.execute(segment_attachment_bind_delete_stmt)
|
||||||
|
|
||||||
# delete upload file
|
# delete upload file
|
||||||
session.query(UploadFile).where(UploadFile.id.in_(attachment_ids)).delete(synchronize_session=False)
|
session.query(UploadFile).where(UploadFile.id.in_(attachment_ids)).delete(synchronize_session=False)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
|
|||||||
logger.info(click.style(f"Start sync document: {document_id}", fg="green"))
|
logger.info(click.style(f"Start sync document: {document_id}", fg="green"))
|
||||||
start_at = time.perf_counter()
|
start_at = time.perf_counter()
|
||||||
|
|
||||||
with session_factory.create_session() as session:
|
with session_factory.create_session() as session, session.begin():
|
||||||
document = session.query(Document).where(Document.id == document_id, Document.dataset_id == dataset_id).first()
|
document = session.query(Document).where(Document.id == document_id, Document.dataset_id == dataset_id).first()
|
||||||
|
|
||||||
if not document:
|
if not document:
|
||||||
@@ -68,7 +68,6 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
|
|||||||
document.indexing_status = "error"
|
document.indexing_status = "error"
|
||||||
document.error = "Datasource credential not found. Please reconnect your Notion workspace."
|
document.error = "Datasource credential not found. Please reconnect your Notion workspace."
|
||||||
document.stopped_at = naive_utc_now()
|
document.stopped_at = naive_utc_now()
|
||||||
session.commit()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
loader = NotionExtractor(
|
loader = NotionExtractor(
|
||||||
@@ -85,7 +84,6 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
|
|||||||
if last_edited_time != page_edited_time:
|
if last_edited_time != page_edited_time:
|
||||||
document.indexing_status = "parsing"
|
document.indexing_status = "parsing"
|
||||||
document.processing_started_at = naive_utc_now()
|
document.processing_started_at = naive_utc_now()
|
||||||
session.commit()
|
|
||||||
|
|
||||||
# delete all document segment and index
|
# delete all document segment and index
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -114,6 +114,21 @@ def mock_db_session():
|
|||||||
session = MagicMock()
|
session = MagicMock()
|
||||||
# Ensure tests can observe session.close() via context manager teardown
|
# Ensure tests can observe session.close() via context manager teardown
|
||||||
session.close = MagicMock()
|
session.close = MagicMock()
|
||||||
|
session.commit = MagicMock()
|
||||||
|
|
||||||
|
# Mock session.begin() context manager to auto-commit on exit
|
||||||
|
begin_cm = MagicMock()
|
||||||
|
begin_cm.__enter__.return_value = session
|
||||||
|
|
||||||
|
def _begin_exit_side_effect(*args, **kwargs):
|
||||||
|
# session.begin().__exit__() should commit if no exception
|
||||||
|
if args[0] is None: # No exception
|
||||||
|
session.commit()
|
||||||
|
|
||||||
|
begin_cm.__exit__.side_effect = _begin_exit_side_effect
|
||||||
|
session.begin.return_value = begin_cm
|
||||||
|
|
||||||
|
# Mock create_session() context manager
|
||||||
cm = MagicMock()
|
cm = MagicMock()
|
||||||
cm.__enter__.return_value = session
|
cm.__enter__.return_value = session
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user