From d243de26ec33477154af83f9f6edbb9b24022a98 Mon Sep 17 00:00:00 2001 From: Renzo <170978465+RenzoMXD@users.noreply.github.com> Date: Thu, 2 Apr 2026 14:34:38 +0200 Subject: [PATCH] refactor: select in metadata_service (#34479) --- api/services/metadata_service.py | 77 ++++++++++++------ .../unit_tests/services/dataset_metadata.py | 78 +++---------------- 2 files changed, 65 insertions(+), 90 deletions(-) diff --git a/api/services/metadata_service.py b/api/services/metadata_service.py index 12729278cc4..672f309bac0 100644 --- a/api/services/metadata_service.py +++ b/api/services/metadata_service.py @@ -1,6 +1,8 @@ import copy import logging +from sqlalchemy import delete, func, select + from core.rag.index_processor.constant.built_in_field import BuiltInField, MetadataDataSource from extensions.ext_database import db from extensions.ext_redis import redis_client @@ -25,10 +27,14 @@ class MetadataService: raise ValueError("Metadata name cannot exceed 255 characters.") current_user, current_tenant_id = current_account_with_tenant() # check if metadata name already exists - if ( - db.session.query(DatasetMetadata) - .filter_by(tenant_id=current_tenant_id, dataset_id=dataset_id, name=metadata_args.name) - .first() + if db.session.scalar( + select(DatasetMetadata) + .where( + DatasetMetadata.tenant_id == current_tenant_id, + DatasetMetadata.dataset_id == dataset_id, + DatasetMetadata.name == metadata_args.name, + ) + .limit(1) ): raise ValueError("Metadata name already exists.") for field in BuiltInField: @@ -54,10 +60,14 @@ class MetadataService: lock_key = f"dataset_metadata_lock_{dataset_id}" # check if metadata name already exists current_user, current_tenant_id = current_account_with_tenant() - if ( - db.session.query(DatasetMetadata) - .filter_by(tenant_id=current_tenant_id, dataset_id=dataset_id, name=name) - .first() + if db.session.scalar( + select(DatasetMetadata) + .where( + DatasetMetadata.tenant_id == current_tenant_id, + DatasetMetadata.dataset_id == dataset_id, + DatasetMetadata.name == name, + ) + .limit(1) ): raise ValueError("Metadata name already exists.") for field in BuiltInField: @@ -65,7 +75,11 @@ class MetadataService: raise ValueError("Metadata name already exists in Built-in fields.") try: MetadataService.knowledge_base_metadata_lock_check(dataset_id, None) - metadata = db.session.query(DatasetMetadata).filter_by(id=metadata_id, dataset_id=dataset_id).first() + metadata = db.session.scalar( + select(DatasetMetadata) + .where(DatasetMetadata.id == metadata_id, DatasetMetadata.dataset_id == dataset_id) + .limit(1) + ) if metadata is None: raise ValueError("Metadata not found.") old_name = metadata.name @@ -74,9 +88,9 @@ class MetadataService: metadata.updated_at = naive_utc_now() # update related documents - dataset_metadata_bindings = ( - db.session.query(DatasetMetadataBinding).filter_by(metadata_id=metadata_id).all() - ) + dataset_metadata_bindings = db.session.scalars( + select(DatasetMetadataBinding).where(DatasetMetadataBinding.metadata_id == metadata_id) + ).all() if dataset_metadata_bindings: document_ids = [binding.document_id for binding in dataset_metadata_bindings] documents = DocumentService.get_document_by_ids(document_ids) @@ -101,15 +115,19 @@ class MetadataService: lock_key = f"dataset_metadata_lock_{dataset_id}" try: MetadataService.knowledge_base_metadata_lock_check(dataset_id, None) - metadata = db.session.query(DatasetMetadata).filter_by(id=metadata_id, dataset_id=dataset_id).first() + metadata = db.session.scalar( + select(DatasetMetadata) + .where(DatasetMetadata.id == metadata_id, DatasetMetadata.dataset_id == dataset_id) + .limit(1) + ) if metadata is None: raise ValueError("Metadata not found.") db.session.delete(metadata) # deal related documents - dataset_metadata_bindings = ( - db.session.query(DatasetMetadataBinding).filter_by(metadata_id=metadata_id).all() - ) + dataset_metadata_bindings = db.session.scalars( + select(DatasetMetadataBinding).where(DatasetMetadataBinding.metadata_id == metadata_id) + ).all() if dataset_metadata_bindings: document_ids = [binding.document_id for binding in dataset_metadata_bindings] documents = DocumentService.get_document_by_ids(document_ids) @@ -224,16 +242,23 @@ class MetadataService: # deal metadata binding (in the same transaction as the doc_metadata update) if not operation.partial_update: - db.session.query(DatasetMetadataBinding).filter_by(document_id=operation.document_id).delete() + db.session.execute( + delete(DatasetMetadataBinding).where( + DatasetMetadataBinding.document_id == operation.document_id + ) + ) current_user, current_tenant_id = current_account_with_tenant() for metadata_value in operation.metadata_list: # check if binding already exists if operation.partial_update: - existing_binding = ( - db.session.query(DatasetMetadataBinding) - .filter_by(document_id=operation.document_id, metadata_id=metadata_value.id) - .first() + existing_binding = db.session.scalar( + select(DatasetMetadataBinding) + .where( + DatasetMetadataBinding.document_id == operation.document_id, + DatasetMetadataBinding.metadata_id == metadata_value.id, + ) + .limit(1) ) if existing_binding: continue @@ -275,9 +300,13 @@ class MetadataService: "id": item.get("id"), "name": item.get("name"), "type": item.get("type"), - "count": db.session.query(DatasetMetadataBinding) - .filter_by(metadata_id=item.get("id"), dataset_id=dataset.id) - .count(), + "count": db.session.scalar( + select(func.count(DatasetMetadataBinding.id)).where( + DatasetMetadataBinding.metadata_id == item.get("id"), + DatasetMetadataBinding.dataset_id == dataset.id, + ) + ) + or 0, } for item in dataset.doc_metadata or [] if item.get("id") != "built-in" diff --git a/api/tests/unit_tests/services/dataset_metadata.py b/api/tests/unit_tests/services/dataset_metadata.py index 5ba18d8dc0f..b825a8686a5 100644 --- a/api/tests/unit_tests/services/dataset_metadata.py +++ b/api/tests/unit_tests/services/dataset_metadata.py @@ -401,10 +401,7 @@ class TestMetadataServiceCreateMetadata: metadata_args = MetadataTestDataFactory.create_metadata_args_mock(name="category", metadata_type="string") # Mock query to return None (no existing metadata with same name) - mock_query = Mock() - mock_query.filter_by.return_value = mock_query - mock_query.first.return_value = None - mock_db_session.query.return_value = mock_query + mock_db_session.scalar.return_value = None # Mock BuiltInField enum iteration with patch("services.metadata_service.BuiltInField") as mock_builtin: @@ -417,10 +414,6 @@ class TestMetadataServiceCreateMetadata: assert result is not None assert isinstance(result, DatasetMetadata) - # Verify query was made to check for duplicates - mock_db_session.query.assert_called() - mock_query.filter_by.assert_called() - # Verify metadata was added and committed mock_db_session.add.assert_called_once() mock_db_session.commit.assert_called_once() @@ -468,10 +461,7 @@ class TestMetadataServiceCreateMetadata: # Mock existing metadata with same name existing_metadata = MetadataTestDataFactory.create_metadata_mock(name="category") - mock_query = Mock() - mock_query.filter_by.return_value = mock_query - mock_query.first.return_value = existing_metadata - mock_db_session.query.return_value = mock_query + mock_db_session.scalar.return_value = existing_metadata # Act & Assert with pytest.raises(ValueError, match="Metadata name already exists"): @@ -500,10 +490,7 @@ class TestMetadataServiceCreateMetadata: ) # Mock query to return None (no duplicate in database) - mock_query = Mock() - mock_query.filter_by.return_value = mock_query - mock_query.first.return_value = None - mock_db_session.query.return_value = mock_query + mock_db_session.scalar.return_value = None # Mock BuiltInField to include the conflicting name with patch("services.metadata_service.BuiltInField") as mock_builtin: @@ -597,27 +584,11 @@ class TestMetadataServiceUpdateMetadataName: existing_metadata = MetadataTestDataFactory.create_metadata_mock(metadata_id=metadata_id, name="category") - # Mock query for duplicate check (no duplicate) - mock_query = Mock() - mock_query.filter_by.return_value = mock_query - mock_query.first.return_value = None - mock_db_session.query.return_value = mock_query - - # Mock metadata retrieval - def query_side_effect(model): - if model == DatasetMetadata: - mock_meta_query = Mock() - mock_meta_query.filter_by.return_value = mock_meta_query - mock_meta_query.first.return_value = existing_metadata - return mock_meta_query - return mock_query - - mock_db_session.query.side_effect = query_side_effect + # Mock scalar calls: first for duplicate check (None), second for metadata retrieval + mock_db_session.scalar.side_effect = [None, existing_metadata] # Mock no metadata bindings (no documents to update) - mock_binding_query = Mock() - mock_binding_query.filter_by.return_value = mock_binding_query - mock_binding_query.all.return_value = [] + mock_db_session.scalars.return_value.all.return_value = [] # Mock BuiltInField enum with patch("services.metadata_service.BuiltInField") as mock_builtin: @@ -655,22 +626,8 @@ class TestMetadataServiceUpdateMetadataName: metadata_id = "non-existent-metadata" new_name = "updated_category" - # Mock query for duplicate check (no duplicate) - mock_query = Mock() - mock_query.filter_by.return_value = mock_query - mock_query.first.return_value = None - mock_db_session.query.return_value = mock_query - - # Mock metadata retrieval to return None - def query_side_effect(model): - if model == DatasetMetadata: - mock_meta_query = Mock() - mock_meta_query.filter_by.return_value = mock_meta_query - mock_meta_query.first.return_value = None # Not found - return mock_meta_query - return mock_query - - mock_db_session.query.side_effect = query_side_effect + # Mock scalar calls: first for duplicate check (None), second for metadata retrieval (None = not found) + mock_db_session.scalar.side_effect = [None, None] # Mock BuiltInField enum with patch("services.metadata_service.BuiltInField") as mock_builtin: @@ -746,15 +703,10 @@ class TestMetadataServiceDeleteMetadata: existing_metadata = MetadataTestDataFactory.create_metadata_mock(metadata_id=metadata_id, name="category") # Mock metadata retrieval - mock_query = Mock() - mock_query.filter_by.return_value = mock_query - mock_query.first.return_value = existing_metadata - mock_db_session.query.return_value = mock_query + mock_db_session.scalar.return_value = existing_metadata # Mock no metadata bindings (no documents to update) - mock_binding_query = Mock() - mock_binding_query.filter_by.return_value = mock_binding_query - mock_binding_query.all.return_value = [] + mock_db_session.scalars.return_value.all.return_value = [] # Act result = MetadataService.delete_metadata(dataset_id, metadata_id) @@ -788,10 +740,7 @@ class TestMetadataServiceDeleteMetadata: metadata_id = "non-existent-metadata" # Mock metadata retrieval to return None - mock_query = Mock() - mock_query.filter_by.return_value = mock_query - mock_query.first.return_value = None - mock_db_session.query.return_value = mock_query + mock_db_session.scalar.return_value = None # Act & Assert with pytest.raises(ValueError, match="Metadata not found"): @@ -1013,10 +962,7 @@ class TestMetadataServiceGetDatasetMetadatas: ) # Mock usage count queries - mock_query = Mock() - mock_query.filter_by.return_value = mock_query - mock_query.count.return_value = 5 # 5 documents use this metadata - mock_db_session.query.return_value = mock_query + mock_db_session.scalar.return_value = 5 # 5 documents use this metadata # Act result = MetadataService.get_dataset_metadatas(dataset)