Mirror of https://github.com/langgenius/dify.git (synced 2026-04-05 17:49:23 +08:00)

**Compare commits:** `feat/login` ... `0.9.1-fix1` (42 commits)
| SHA1 |
|---|
| c2328cb676 |
| 36d3221a05 |
| 40f2e7d821 |
| f45042aa8e |
| 2ab8bc679f |
| 2571b0c4e3 |
| 959a81a41b |
| 4480b469a6 |
| fcfa1252a0 |
| e1e2d0b364 |
| 9815a0911b |
| dc5839b6bb |
| 4373777871 |
| 415d27c8bf |
| 5366820a2f |
| 5f8a27074e |
| 24ba9fdf6c |
| 824a0dd63e |
| c2d606d587 |
| 2deaece7e2 |
| 0d84221b2c |
| cdd7e55a88 |
| 1f5cc071f8 |
| 625e4c4c72 |
| 7850a28ec8 |
| 730d3a6d7c |
| d6a44e9990 |
| 3069b5cf57 |
| 7873e455bb |
| a651b73db0 |
| d2ce4960f1 |
| 1af4ca344e |
| fa837b2dfd |
| 824a71388a |
| 4585cffce1 |
| 13046709a9 |
| 9d221a5e19 |
| 77aef9ff1d |
| 503561f464 |
| ada9d408ac |
| 3af65b2f45 |
| 369e1e6f58 |
**.github/workflows/build-push.yml** (vendored, 2 lines changed)
```diff
@@ -125,7 +125,7 @@ jobs:
       with:
         images: ${{ env[matrix.image_name_env] }}
         tags: |
-          type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/') && !contains(github.ref, '-') }}
+          type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/') && !contains(github.ref, '-beta') }}
           type=ref,event=branch
           type=sha,enable=true,priority=100,prefix=,suffix=,format=long
           type=raw,value=${{ github.ref_name }},enable=${{ startsWith(github.ref, 'refs/tags/') }}
```
This hunk fixes a typo in the Japanese README's RAG section (サーポイント → サポート, "support"):

```diff
@@ -68,7 +68,7 @@ DifyはオープンソースのLLMアプリケーション開発プラットフ
 プロンプトの作成、モデルパフォーマンスの比較が行え、チャットベースのアプリに音声合成などの機能も追加できます。

 **4. RAGパイプライン**:
-ドキュメントの取り込みから検索までをカバーする広範なRAG機能ができます。ほかにもPDF、PPT、その他の一般的なドキュメントフォーマットからのテキスト抽出のサーポイントも提供します。
+ドキュメントの取り込みから検索までをカバーする広範なRAG機能ができます。ほかにもPDF、PPT、その他の一般的なドキュメントフォーマットからのテキスト抽出のサポートも提供します。

 **5. エージェント機能**:
 LLM Function CallingやReActに基づくエージェントの定義が可能で、AIエージェント用のプリビルトまたはカスタムツールを追加できます。Difyには、Google検索、DALL·E、Stable Diffusion、WolframAlphaなどのAIエージェント用の50以上の組み込みツールが提供します。
```
```diff
@@ -39,7 +39,7 @@ DB_DATABASE=dify

 # Storage configuration
 # use for store upload files, private keys...
-# storage type: local, s3, azure-blob, google-storage, tencent-cos, huawei-obs, volcengine-tos
+# storage type: local, s3, azure-blob, google-storage, tencent-cos, huawei-obs, volcengine-tos, baidu-obs
 STORAGE_TYPE=local
 STORAGE_LOCAL_PATH=storage
 S3_USE_AWS_MANAGED_IAM=false

@@ -79,6 +79,12 @@ HUAWEI_OBS_SECRET_KEY=your-secret-key
 HUAWEI_OBS_ACCESS_KEY=your-access-key
 HUAWEI_OBS_SERVER=your-server-url

+# Baidu OBS Storage Configuration
+BAIDU_OBS_BUCKET_NAME=your-bucket-name
+BAIDU_OBS_SECRET_KEY=your-secret-key
+BAIDU_OBS_ACCESS_KEY=your-access-key
+BAIDU_OBS_ENDPOINT=your-server-url
+
 # OCI Storage configuration
 OCI_ENDPOINT=your-endpoint
 OCI_BUCKET_NAME=your-bucket-name
```
```diff
@@ -8,6 +8,7 @@ from configs.middleware.cache.redis_config import RedisConfig
 from configs.middleware.storage.aliyun_oss_storage_config import AliyunOSSStorageConfig
 from configs.middleware.storage.amazon_s3_storage_config import S3StorageConfig
 from configs.middleware.storage.azure_blob_storage_config import AzureBlobStorageConfig
+from configs.middleware.storage.baidu_obs_storage_config import BaiduOBSStorageConfig
 from configs.middleware.storage.google_cloud_storage_config import GoogleCloudStorageConfig
 from configs.middleware.storage.huawei_obs_storage_config import HuaweiCloudOBSStorageConfig
 from configs.middleware.storage.oci_storage_config import OCIStorageConfig

@@ -200,12 +201,13 @@ class MiddlewareConfig(
     StorageConfig,
     AliyunOSSStorageConfig,
     AzureBlobStorageConfig,
+    BaiduOBSStorageConfig,
     GoogleCloudStorageConfig,
-    TencentCloudCOSStorageConfig,
     HuaweiCloudOBSStorageConfig,
-    VolcengineTOSStorageConfig,
-    S3StorageConfig,
     OCIStorageConfig,
+    S3StorageConfig,
+    TencentCloudCOSStorageConfig,
+    VolcengineTOSStorageConfig,
     # configs of vdb and vdb providers
     VectorStoreConfig,
     AnalyticdbConfig,
```
**api/configs/middleware/storage/baidu_obs_storage_config.py** (new file, 29 lines)
```python
from typing import Optional

from pydantic import BaseModel, Field


class BaiduOBSStorageConfig(BaseModel):
    """
    Configuration settings for Baidu Object Storage Service (OBS)
    """

    BAIDU_OBS_BUCKET_NAME: Optional[str] = Field(
        description="Name of the Baidu OBS bucket to store and retrieve objects (e.g., 'my-obs-bucket')",
        default=None,
    )

    BAIDU_OBS_ACCESS_KEY: Optional[str] = Field(
        description="Access Key ID for authenticating with Baidu OBS",
        default=None,
    )

    BAIDU_OBS_SECRET_KEY: Optional[str] = Field(
        description="Secret Access Key for authenticating with Baidu OBS",
        default=None,
    )

    BAIDU_OBS_ENDPOINT: Optional[str] = Field(
        description="URL of the Baidu OBS endpoint for your chosen region (e.g., 'https://.bj.bcebos.com')",
        default=None,
    )
```
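For orientation, a minimal sketch of how this config model behaves (assumes pydantic v2; the values are placeholders, and the class is trimmed to two fields for illustration):

```python
from typing import Optional

from pydantic import BaseModel, Field


class BaiduOBSStorageConfig(BaseModel):
    # All fields default to None, so the config can be constructed empty
    # and only populated when the baidu-obs storage type is selected.
    BAIDU_OBS_BUCKET_NAME: Optional[str] = Field(default=None)
    BAIDU_OBS_ENDPOINT: Optional[str] = Field(default=None)


cfg = BaiduOBSStorageConfig(BAIDU_OBS_BUCKET_NAME="my-obs-bucket")  # placeholder value
print(cfg.BAIDU_OBS_BUCKET_NAME)  # my-obs-bucket
print(cfg.BAIDU_OBS_ENDPOINT)     # None
```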
```diff
@@ -9,7 +9,7 @@ class PackagingInfo(BaseSettings):

     CURRENT_VERSION: str = Field(
         description="Dify version",
-        default="0.8.3",
+        default="0.9.1-fix1",
     )

     COMMIT_SHA: str = Field(
```
```diff
@@ -37,7 +37,16 @@ from .auth import activate, data_source_bearer_auth, data_source_oauth, forgot_p
 from .billing import billing

 # Import datasets controllers
-from .datasets import data_source, datasets, datasets_document, datasets_segments, file, hit_testing, website
+from .datasets import (
+    data_source,
+    datasets,
+    datasets_document,
+    datasets_segments,
+    external,
+    file,
+    hit_testing,
+    website,
+)

 # Import explore controllers
 from .explore import (
```
```diff
@@ -49,7 +49,7 @@ class DatasetListApi(Resource):
         page = request.args.get("page", default=1, type=int)
         limit = request.args.get("limit", default=20, type=int)
         ids = request.args.getlist("ids")
-        provider = request.args.get("provider", default="vendor")
+        # provider = request.args.get("provider", default="vendor")
         search = request.args.get("keyword", default=None, type=str)
         tag_ids = request.args.getlist("tag_ids")

@@ -57,7 +57,7 @@ class DatasetListApi(Resource):
             datasets, total = DatasetService.get_datasets_by_ids(ids, current_user.current_tenant_id)
         else:
             datasets, total = DatasetService.get_datasets(
-                page, limit, provider, current_user.current_tenant_id, current_user, search, tag_ids
+                page, limit, current_user.current_tenant_id, current_user, search, tag_ids
             )

         # check embedding setting
```
```diff
@@ -110,6 +110,26 @@ class DatasetListApi(Resource):
             nullable=True,
             help="Invalid indexing technique.",
         )
+        parser.add_argument(
+            "external_knowledge_api_id",
+            type=str,
+            nullable=True,
+            required=False,
+        )
+        parser.add_argument(
+            "provider",
+            type=str,
+            nullable=True,
+            choices=Dataset.PROVIDER_LIST,
+            required=False,
+            default="vendor",
+        )
+        parser.add_argument(
+            "external_knowledge_id",
+            type=str,
+            nullable=True,
+            required=False,
+        )
         args = parser.parse_args()

         # The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator

@@ -123,6 +143,9 @@ class DatasetListApi(Resource):
                 indexing_technique=args["indexing_technique"],
                 account=current_user,
                 permission=DatasetPermissionEnum.ONLY_ME,
+                provider=args["provider"],
+                external_knowledge_api_id=args["external_knowledge_api_id"],
+                external_knowledge_id=args["external_knowledge_id"],
             )
         except services.errors.dataset.DatasetNameDuplicateError:
             raise DatasetNameDuplicateError()
```
```diff
@@ -211,6 +234,33 @@ class DatasetApi(Resource):
         )
         parser.add_argument("retrieval_model", type=dict, location="json", help="Invalid retrieval model.")
         parser.add_argument("partial_member_list", type=list, location="json", help="Invalid parent user list.")
+
+        parser.add_argument(
+            "external_retrieval_model",
+            type=dict,
+            required=False,
+            nullable=True,
+            location="json",
+            help="Invalid external retrieval model.",
+        )
+
+        parser.add_argument(
+            "external_knowledge_id",
+            type=str,
+            required=False,
+            nullable=True,
+            location="json",
+            help="Invalid external knowledge id.",
+        )
+
+        parser.add_argument(
+            "external_knowledge_api_id",
+            type=str,
+            required=False,
+            nullable=True,
+            location="json",
+            help="Invalid external knowledge api id.",
+        )
         args = parser.parse_args()
         data = request.get_json()
```
**api/controllers/console/datasets/external.py** (new file, 239 lines)
```python
from flask import request
from flask_login import current_user
from flask_restful import Resource, marshal, reqparse
from werkzeug.exceptions import Forbidden, InternalServerError, NotFound

import services
from controllers.console import api
from controllers.console.datasets.error import DatasetNameDuplicateError
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required
from fields.dataset_fields import dataset_detail_fields
from libs.login import login_required
from services.dataset_service import DatasetService
from services.external_knowledge_service import ExternalDatasetService
from services.hit_testing_service import HitTestingService


def _validate_name(name):
    if not name or len(name) < 1 or len(name) > 100:
        raise ValueError("Name must be between 1 to 100 characters.")
    return name


def _validate_description_length(description):
    if description and len(description) > 400:
        raise ValueError("Description cannot exceed 400 characters.")
    return description


class ExternalApiTemplateListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        page = request.args.get("page", default=1, type=int)
        limit = request.args.get("limit", default=20, type=int)
        search = request.args.get("keyword", default=None, type=str)

        external_knowledge_apis, total = ExternalDatasetService.get_external_knowledge_apis(
            page, limit, current_user.current_tenant_id, search
        )
        response = {
            "data": [item.to_dict() for item in external_knowledge_apis],
            "has_more": len(external_knowledge_apis) == limit,
            "limit": limit,
            "total": total,
            "page": page,
        }
        return response, 200

    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="Name is required. Name must be between 1 to 100 characters.",
            type=_validate_name,
        )
        parser.add_argument(
            "settings",
            type=dict,
            location="json",
            nullable=False,
            required=True,
        )
        args = parser.parse_args()

        ExternalDatasetService.validate_api_list(args["settings"])

        # The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            external_knowledge_api = ExternalDatasetService.create_external_knowledge_api(
                tenant_id=current_user.current_tenant_id, user_id=current_user.id, args=args
            )
        except services.errors.dataset.DatasetNameDuplicateError:
            raise DatasetNameDuplicateError()

        return external_knowledge_api.to_dict(), 201


class ExternalApiTemplateApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, external_knowledge_api_id):
        external_knowledge_api_id = str(external_knowledge_api_id)
        external_knowledge_api = ExternalDatasetService.get_external_knowledge_api(external_knowledge_api_id)
        if external_knowledge_api is None:
            raise NotFound("API template not found.")

        return external_knowledge_api.to_dict(), 200

    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, external_knowledge_api_id):
        external_knowledge_api_id = str(external_knowledge_api_id)

        parser = reqparse.RequestParser()
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="type is required. Name must be between 1 to 100 characters.",
            type=_validate_name,
        )
        parser.add_argument(
            "settings",
            type=dict,
            location="json",
            nullable=False,
            required=True,
        )
        args = parser.parse_args()
        ExternalDatasetService.validate_api_list(args["settings"])

        external_knowledge_api = ExternalDatasetService.update_external_knowledge_api(
            tenant_id=current_user.current_tenant_id,
            user_id=current_user.id,
            external_knowledge_api_id=external_knowledge_api_id,
            args=args,
        )

        return external_knowledge_api.to_dict(), 200

    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, external_knowledge_api_id):
        external_knowledge_api_id = str(external_knowledge_api_id)

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor or current_user.is_dataset_operator:
            raise Forbidden()

        ExternalDatasetService.delete_external_knowledge_api(current_user.current_tenant_id, external_knowledge_api_id)
        return {"result": "success"}, 200


class ExternalApiUseCheckApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, external_knowledge_api_id):
        external_knowledge_api_id = str(external_knowledge_api_id)

        external_knowledge_api_is_using, count = ExternalDatasetService.external_knowledge_api_use_check(
            external_knowledge_api_id
        )
        return {"is_using": external_knowledge_api_is_using, "count": count}, 200


class ExternalDatasetCreateApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument("external_knowledge_api_id", type=str, required=True, nullable=False, location="json")
        parser.add_argument("external_knowledge_id", type=str, required=True, nullable=False, location="json")
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="name is required. Name must be between 1 to 100 characters.",
            type=_validate_name,
        )
        parser.add_argument("description", type=str, required=False, nullable=True, location="json")
        parser.add_argument("external_retrieval_model", type=dict, required=False, location="json")

        args = parser.parse_args()

        # The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            dataset = ExternalDatasetService.create_external_dataset(
                tenant_id=current_user.current_tenant_id,
                user_id=current_user.id,
                args=args,
            )
        except services.errors.dataset.DatasetNameDuplicateError:
            raise DatasetNameDuplicateError()

        return marshal(dataset, dataset_detail_fields), 201


class ExternalKnowledgeHitTestingApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self, dataset_id):
        dataset_id_str = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id_str)
        if dataset is None:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument("query", type=str, location="json")
        parser.add_argument("external_retrieval_model", type=dict, required=False, location="json")
        args = parser.parse_args()

        HitTestingService.hit_testing_args_check(args)

        try:
            response = HitTestingService.external_retrieve(
                dataset=dataset,
                query=args["query"],
                account=current_user,
                external_retrieval_model=args["external_retrieval_model"],
            )

            return response
        except Exception as e:
            raise InternalServerError(str(e))


api.add_resource(ExternalKnowledgeHitTestingApi, "/datasets/<uuid:dataset_id>/external-hit-testing")
api.add_resource(ExternalDatasetCreateApi, "/datasets/external")
api.add_resource(ExternalApiTemplateListApi, "/datasets/external-knowledge-api")
api.add_resource(ExternalApiTemplateApi, "/datasets/external-knowledge-api/<uuid:external_knowledge_api_id>")
api.add_resource(ExternalApiUseCheckApi, "/datasets/external-knowledge-api/<uuid:external_knowledge_api_id>/use-check")
```
```diff
@@ -47,6 +47,7 @@ class HitTestingApi(Resource):
         parser = reqparse.RequestParser()
         parser.add_argument("query", type=str, location="json")
         parser.add_argument("retrieval_model", type=dict, required=False, location="json")
+        parser.add_argument("external_retrieval_model", type=dict, required=False, location="json")
         args = parser.parse_args()

         HitTestingService.hit_testing_args_check(args)

@@ -57,6 +58,7 @@ class HitTestingApi(Resource):
                 query=args["query"],
                 account=current_user,
                 retrieval_model=args["retrieval_model"],
+                external_retrieval_model=args["external_retrieval_model"],
                 limit=10,
             )
```
```diff
@@ -14,7 +14,9 @@ class WebsiteCrawlApi(Resource):
     @account_initialization_required
     def post(self):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, nullable=True, location="json")
+        parser.add_argument(
+            "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json"
+        )
         parser.add_argument("url", type=str, required=True, nullable=True, location="json")
         parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
         args = parser.parse_args()

@@ -33,7 +35,7 @@ class WebsiteCrawlStatusApi(Resource):
     @account_initialization_required
     def get(self, job_id: str):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, location="args")
+        parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args")
         args = parser.parse_args()
         # get crawl status
         try:
```
```diff
@@ -38,11 +38,52 @@ class VersionApi(Resource):
             return result

         content = json.loads(response.content)
-        result["version"] = content["version"]
-        result["release_date"] = content["releaseDate"]
-        result["release_notes"] = content["releaseNotes"]
-        result["can_auto_update"] = content["canAutoUpdate"]
+        if _has_new_version(latest_version=content["version"], current_version=f"{args.get('current_version')}"):
+            result["version"] = content["version"]
+            result["release_date"] = content["releaseDate"]
+            result["release_notes"] = content["releaseNotes"]
+            result["can_auto_update"] = content["canAutoUpdate"]
         return result
+
+
+def _has_new_version(*, latest_version: str, current_version: str) -> bool:
+    def parse_version(version: str) -> tuple:
+        # Split version into parts and pre-release suffix if any
+        parts = version.split("-")
+        version_parts = parts[0].split(".")
+        pre_release = parts[1] if len(parts) > 1 else None
+
+        # Validate version format
+        if len(version_parts) != 3:
+            raise ValueError(f"Invalid version format: {version}")
+
+        try:
+            # Convert version parts to integers
+            major, minor, patch = map(int, version_parts)
+            return (major, minor, patch, pre_release)
+        except ValueError:
+            raise ValueError(f"Invalid version format: {version}")
+
+    latest = parse_version(latest_version)
+    current = parse_version(current_version)
+
+    # Compare major, minor, and patch versions
+    for latest_part, current_part in zip(latest[:3], current[:3]):
+        if latest_part > current_part:
+            return True
+        elif latest_part < current_part:
+            return False
+
+    # If versions are equal, check pre-release suffixes
+    if latest[3] is None and current[3] is not None:
+        return True
+    elif latest[3] is not None and current[3] is None:
+        return False
+    elif latest[3] is not None and current[3] is not None:
+        # Simple string comparison for pre-release versions
+        return latest[3] > current[3]
+
+    return False


 api.add_resource(VersionApi, "/version")
```
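A few illustrative calls, consistent with the comparison rules implemented above (not part of the diff):

```python
# Plain semantic comparison on major.minor.patch:
assert _has_new_version(latest_version="0.9.1", current_version="0.8.3") is True
assert _has_new_version(latest_version="0.9.1", current_version="0.9.1") is False
# A stable release counts as newer than the same version's pre-release:
assert _has_new_version(latest_version="0.9.1", current_version="0.9.1-fix1") is True
# Pre-release suffixes fall back to simple string comparison:
assert _has_new_version(latest_version="0.9.1-fix2", current_version="0.9.1-fix1") is True
```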
```diff
@@ -72,8 +72,9 @@ class DefaultModelApi(Resource):
                     provider=model_setting["provider"],
                     model=model_setting["model"],
                 )
-            except Exception:
-                logging.warning(f"{model_setting['model_type']} save error")
+            except Exception as ex:
+                logging.exception(f"{model_setting['model_type']} save error: {ex}")
+                raise ex

         return {"result": "success"}
```
**api/controllers/files/error.py** (new file, 7 lines)
```python
from libs.exception import BaseHTTPException


class UnsupportedFileTypeError(BaseHTTPException):
    error_code = "unsupported_file_type"
    description = "File type not allowed."
    code = 415
```
```diff
@@ -4,7 +4,7 @@ from werkzeug.exceptions import NotFound

 import services
 from controllers.files import api
-from libs.exception import BaseHTTPException
+from controllers.files.error import UnsupportedFileTypeError
 from services.account_service import TenantService
 from services.file_service import FileService

@@ -50,9 +50,3 @@ class WorkspaceWebappLogoApi(Resource):

 api.add_resource(ImagePreviewApi, "/files/<uuid:file_id>/image-preview")
 api.add_resource(WorkspaceWebappLogoApi, "/files/workspaces/<uuid:workspace_id>/webapp-logo")
-
-
-class UnsupportedFileTypeError(BaseHTTPException):
-    error_code = "unsupported_file_type"
-    description = "File type not allowed."
-    code = 415
```
```diff
@@ -3,8 +3,8 @@ from flask_restful import Resource, reqparse
 from werkzeug.exceptions import Forbidden, NotFound

 from controllers.files import api
+from controllers.files.error import UnsupportedFileTypeError
 from core.tools.tool_file_manager import ToolFileManager
-from libs.exception import BaseHTTPException


 class ToolFilePreviewApi(Resource):

@@ -43,9 +43,3 @@ class ToolFilePreviewApi(Resource):

 api.add_resource(ToolFilePreviewApi, "/files/tools/<uuid:file_id>.<string:extension>")
-
-
-class UnsupportedFileTypeError(BaseHTTPException):
-    error_code = "unsupported_file_type"
-    description = "File type not allowed."
-    code = 415
```
```diff
@@ -4,6 +4,7 @@ from flask_restful import Resource, reqparse
 from werkzeug.exceptions import InternalServerError, NotFound

 import services
+from constants import UUID_NIL
 from controllers.service_api import api
 from controllers.service_api.app.error import (
     AppUnavailableError,

@@ -107,6 +108,7 @@ class ChatApi(Resource):
         parser.add_argument("conversation_id", type=uuid_value, location="json")
         parser.add_argument("retriever_from", type=str, required=False, default="dev", location="json")
         parser.add_argument("auto_generate_name", type=bool, required=False, default=True, location="json")
+        parser.add_argument("parent_message_id", type=uuid_value, required=False, default=UUID_NIL, location="json")

         args = parser.parse_args()
```
```diff
@@ -28,11 +28,11 @@ class DatasetListApi(DatasetApiResource):

         page = request.args.get("page", default=1, type=int)
         limit = request.args.get("limit", default=20, type=int)
-        provider = request.args.get("provider", default="vendor")
+        # provider = request.args.get("provider", default="vendor")
         search = request.args.get("keyword", default=None, type=str)
         tag_ids = request.args.getlist("tag_ids")

-        datasets, total = DatasetService.get_datasets(page, limit, provider, tenant_id, current_user, search, tag_ids)
+        datasets, total = DatasetService.get_datasets(page, limit, tenant_id, current_user, search, tag_ids)
         # check embedding setting
         provider_manager = ProviderManager()
         configurations = provider_manager.get_configurations(tenant_id=current_user.current_tenant_id)

@@ -82,6 +82,26 @@ class DatasetListApi(DatasetApiResource):
             required=False,
             nullable=False,
         )
+        parser.add_argument(
+            "external_knowledge_api_id",
+            type=str,
+            nullable=True,
+            required=False,
+            default="_validate_name",
+        )
+        parser.add_argument(
+            "provider",
+            type=str,
+            nullable=True,
+            required=False,
+            default="vendor",
+        )
+        parser.add_argument(
+            "external_knowledge_id",
+            type=str,
+            nullable=True,
+            required=False,
+        )
         args = parser.parse_args()

         try:

@@ -91,6 +111,9 @@ class DatasetListApi(DatasetApiResource):
                 indexing_technique=args["indexing_technique"],
                 account=current_user,
                 permission=args["permission"],
+                provider=args["provider"],
+                external_knowledge_api_id=args["external_knowledge_api_id"],
+                external_knowledge_id=args["external_knowledge_id"],
             )
         except services.errors.dataset.DatasetNameDuplicateError:
             raise DatasetNameDuplicateError()
```
```diff
@@ -14,7 +14,7 @@ class CotAgentOutputParser:
     ) -> Generator[Union[str, AgentScratchpadUnit.Action], None, None]:
         def parse_action(json_str):
             try:
-                action = json.loads(json_str)
+                action = json.loads(json_str, strict=False)
                 action_name = None
                 action_input = None
```
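The `strict=False` switch matters for LLM output, which often embeds raw control characters inside JSON strings; a small self-contained illustration:

```python
import json

# A JSON document whose string value contains a literal newline character.
raw = '{"action": "search", "action_input": "line1\nline2"}'

try:
    json.loads(raw)  # strict=True is the default
except json.JSONDecodeError as e:
    print("strict parse failed:", e)  # Invalid control character

# strict=False permits control characters inside strings.
print(json.loads(raw, strict=False))  # {'action': 'search', 'action_input': 'line1\nline2'}
```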
```diff
@@ -1,2 +1,2 @@
-class VariableError(Exception):
+class VariableError(ValueError):
     pass
```
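The practical effect of rebasing the exception: callers that already catch `ValueError` now also catch variable errors. A minimal illustration:

```python
class VariableError(ValueError):
    pass


try:
    raise VariableError("invalid variable payload")
except ValueError as e:  # previously this handler would not have caught VariableError
    print(e)
```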
```diff
@@ -44,7 +44,6 @@ class DatasetIndexToolCallbackHandler:
             DocumentSegment.index_node_id == document.metadata["doc_id"]
         )

-        # if 'dataset_id' in document.metadata:
         if "dataset_id" in document.metadata:
             query = query.filter(DocumentSegment.dataset_id == document.metadata["dataset_id"])

@@ -59,7 +58,7 @@ class DatasetIndexToolCallbackHandler:
         for item in resource:
             dataset_retriever_resource = DatasetRetrieverResource(
                 message_id=self._message_id,
-                position=item.get("position"),
+                position=item.get("position") or 0,
                 dataset_id=item.get("dataset_id"),
                 dataset_name=item.get("dataset_name"),
                 document_id=item.get("document_id"),
```
```diff
@@ -50,34 +50,62 @@ provider_credential_schema:
           label:
             en_US: US East (N. Virginia)
             zh_Hans: 美国东部 (弗吉尼亚北部)
+        - value: us-east-2
+          label:
+            en_US: US East (Ohio)
+            zh_Hans: 美国东部 (弗吉尼亚北部)
         - value: us-west-2
           label:
             en_US: US West (Oregon)
             zh_Hans: 美国西部 (俄勒冈州)
+        - value: ap-south-1
+          label:
+            en_US: Asia Pacific (Mumbai)
+            zh_Hans: 亚太地区(孟买)
         - value: ap-southeast-1
           label:
             en_US: Asia Pacific (Singapore)
             zh_Hans: 亚太地区 (新加坡)
-        - value: ap-northeast-1
-          label:
-            en_US: Asia Pacific (Tokyo)
-            zh_Hans: 亚太地区 (东京)
-        - value: eu-central-1
-          label:
-            en_US: Europe (Frankfurt)
-            zh_Hans: 欧洲 (法兰克福)
-        - value: eu-west-2
-          label:
-            en_US: Eu west London (London)
-            zh_Hans: 欧洲西部 (伦敦)
-        - value: us-gov-west-1
-          label:
-            en_US: AWS GovCloud (US-West)
-            zh_Hans: AWS GovCloud (US-West)
+        - value: ap-southeast-2
+          label:
+            en_US: Asia Pacific (Sydney)
+            zh_Hans: 亚太地区 (悉尼)
+        - value: ap-northeast-1
+          label:
+            en_US: Asia Pacific (Tokyo)
+            zh_Hans: 亚太地区 (东京)
+        - value: ap-northeast-2
+          label:
+            en_US: Asia Pacific (Seoul)
+            zh_Hans: 亚太地区(首尔)
+        - value: ca-central-1
+          label:
+            en_US: Canada (Central)
+            zh_Hans: 加拿大(中部)
+        - value: eu-central-1
+          label:
+            en_US: Europe (Frankfurt)
+            zh_Hans: 欧洲 (法兰克福)
+        - value: eu-west-1
+          label:
+            en_US: Europe (Ireland)
+            zh_Hans: 欧洲(爱尔兰)
+        - value: eu-west-2
+          label:
+            en_US: Europe (London)
+            zh_Hans: 欧洲西部 (伦敦)
+        - value: eu-west-3
+          label:
+            en_US: Europe (Paris)
+            zh_Hans: 欧洲(巴黎)
+        - value: sa-east-1
+          label:
+            en_US: South America (São Paulo)
+            zh_Hans: 南美洲(圣保罗)
+        - value: us-gov-west-1
+          label:
+            en_US: AWS GovCloud (US-West)
+            zh_Hans: AWS GovCloud (US-West)
     - variable: model_for_validation
       required: false
      label:
```
New file (15 lines):

```yaml
- gemini-1.5-pro
- gemini-1.5-pro-latest
- gemini-1.5-pro-001
- gemini-1.5-pro-002
- gemini-1.5-pro-exp-0801
- gemini-1.5-pro-exp-0827
- gemini-1.5-flash
- gemini-1.5-flash-latest
- gemini-1.5-flash-001
- gemini-1.5-flash-002
- gemini-1.5-flash-exp-0827
- gemini-1.5-flash-8b-exp-0827
- gemini-1.5-flash-8b-exp-0924
- gemini-pro
- gemini-pro-vision
```
```diff
@@ -2,6 +2,8 @@ from typing import IO, Optional

 from openai import OpenAI

+from core.model_runtime.entities.common_entities import I18nObject
+from core.model_runtime.entities.model_entities import AIModelEntity, FetchFrom, ModelType
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.speech2text_model import Speech2TextModel
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI

@@ -58,3 +60,18 @@ class OpenAISpeech2TextModel(_CommonOpenAI, Speech2TextModel):
         response = client.audio.transcriptions.create(model=model, file=file)

         return response.text
+
+    def get_customizable_model_schema(self, model: str, credentials: dict) -> AIModelEntity | None:
+        """
+        used to define customizable model schema
+        """
+        entity = AIModelEntity(
+            model=model,
+            label=I18nObject(en_US=model),
+            fetch_from=FetchFrom.CUSTOMIZABLE_MODEL,
+            model_type=ModelType.SPEECH2TEXT,
+            model_properties={},
+            parameter_rules=[],
+        )
+
+        return entity
```
```diff
@@ -3,6 +3,8 @@ from urllib.parse import urljoin

 import requests

+from core.model_runtime.entities.common_entities import I18nObject
+from core.model_runtime.entities.model_entities import AIModelEntity, FetchFrom, ModelType
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.speech2text_model import Speech2TextModel

@@ -59,3 +61,18 @@ class OAICompatSpeech2TextModel(_CommonOaiApiCompat, Speech2TextModel):
             self._invoke(model, credentials, audio_file)
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
+
+    def get_customizable_model_schema(self, model: str, credentials: dict) -> AIModelEntity | None:
+        """
+        used to define customizable model schema
+        """
+        entity = AIModelEntity(
+            model=model,
+            label=I18nObject(en_US=model),
+            fetch_from=FetchFrom.CUSTOMIZABLE_MODEL,
+            model_type=ModelType.SPEECH2TEXT,
+            model_properties={},
+            parameter_rules=[],
+        )
+
+        return entity
```
```diff
@@ -14,6 +14,10 @@
 - google/gemini-pro
 - cohere/command-r-plus
 - cohere/command-r
+- meta-llama/llama-3.2-1b-instruct
+- meta-llama/llama-3.2-3b-instruct
+- meta-llama/llama-3.2-11b-vision-instruct
+- meta-llama/llama-3.2-90b-vision-instruct
 - meta-llama/llama-3.1-405b-instruct
 - meta-llama/llama-3.1-70b-instruct
 - meta-llama/llama-3.1-8b-instruct

@@ -22,6 +26,7 @@
 - mistralai/mixtral-8x22b-instruct
 - mistralai/mixtral-8x7b-instruct
 - mistralai/mistral-7b-instruct
+- qwen/qwen-2.5-72b-instruct
 - qwen/qwen-2-72b-instruct
 - deepseek/deepseek-chat
 - deepseek/deepseek-coder
```
New file (45 lines):

```yaml
model: meta-llama/llama-3.2-11b-vision-instruct
label:
  zh_Hans: llama-3.2-11b-vision-instruct
  en_US: llama-3.2-11b-vision-instruct
model_type: llm
features:
  - agent-thought
model_properties:
  mode: chat
  context_size: 131072
parameter_rules:
  - name: temperature
    use_template: temperature
  - name: top_p
    use_template: top_p
  - name: top_k
    label:
      zh_Hans: 取样数量
      en_US: Top k
    type: int
    help:
      zh_Hans: 仅从每个后续标记的前 K 个选项中采样。
      en_US: Only sample from the top K options for each subsequent token.
  - name: max_tokens
    use_template: max_tokens
  - name: context_length_exceeded_behavior
    default: None
    label:
      zh_Hans: 上下文长度超出行为
      en_US: Context Length Exceeded Behavior
    help:
      zh_Hans: 上下文长度超出行为
      en_US: Context Length Exceeded Behavior
    type: string
    options:
      - None
      - truncate
      - error
  - name: response_format
    use_template: response_format
pricing:
  input: '0.055'
  output: '0.055'
  unit: '0.000001'
  currency: USD
```

New file (45 lines):

```yaml
model: meta-llama/llama-3.2-1b-instruct
label:
  zh_Hans: llama-3.2-1b-instruct
  en_US: llama-3.2-1b-instruct
model_type: llm
features:
  - agent-thought
model_properties:
  mode: chat
  context_size: 131072
parameter_rules:
  - name: temperature
    use_template: temperature
  - name: top_p
    use_template: top_p
  - name: top_k
    label:
      zh_Hans: 取样数量
      en_US: Top k
    type: int
    help:
      zh_Hans: 仅从每个后续标记的前 K 个选项中采样。
      en_US: Only sample from the top K options for each subsequent token.
  - name: max_tokens
    use_template: max_tokens
  - name: context_length_exceeded_behavior
    default: None
    label:
      zh_Hans: 上下文长度超出行为
      en_US: Context Length Exceeded Behavior
    help:
      zh_Hans: 上下文长度超出行为
      en_US: Context Length Exceeded Behavior
    type: string
    options:
      - None
      - truncate
      - error
  - name: response_format
    use_template: response_format
pricing:
  input: '0.01'
  output: '0.02'
  unit: '0.000001'
  currency: USD
```

New file (45 lines):

```yaml
model: meta-llama/llama-3.2-3b-instruct
label:
  zh_Hans: llama-3.2-3b-instruct
  en_US: llama-3.2-3b-instruct
model_type: llm
features:
  - agent-thought
model_properties:
  mode: chat
  context_size: 131072
parameter_rules:
  - name: temperature
    use_template: temperature
  - name: top_p
    use_template: top_p
  - name: top_k
    label:
      zh_Hans: 取样数量
      en_US: Top k
    type: int
    help:
      zh_Hans: 仅从每个后续标记的前 K 个选项中采样。
      en_US: Only sample from the top K options for each subsequent token.
  - name: max_tokens
    use_template: max_tokens
  - name: context_length_exceeded_behavior
    default: None
    label:
      zh_Hans: 上下文长度超出行为
      en_US: Context Length Exceeded Behavior
    help:
      zh_Hans: 上下文长度超出行为
      en_US: Context Length Exceeded Behavior
    type: string
    options:
      - None
      - truncate
      - error
  - name: response_format
    use_template: response_format
pricing:
  input: '0.03'
  output: '0.05'
  unit: '0.000001'
  currency: USD
```

New file (45 lines):

```yaml
model: meta-llama/llama-3.2-90b-vision-instruct
label:
  zh_Hans: llama-3.2-90b-vision-instruct
  en_US: llama-3.2-90b-vision-instruct
model_type: llm
features:
  - agent-thought
model_properties:
  mode: chat
  context_size: 131072
parameter_rules:
  - name: temperature
    use_template: temperature
  - name: top_p
    use_template: top_p
  - name: top_k
    label:
      zh_Hans: 取样数量
      en_US: Top k
    type: int
    help:
      zh_Hans: 仅从每个后续标记的前 K 个选项中采样。
      en_US: Only sample from the top K options for each subsequent token.
  - name: max_tokens
    use_template: max_tokens
  - name: context_length_exceeded_behavior
    default: None
    label:
      zh_Hans: 上下文长度超出行为
      en_US: Context Length Exceeded Behavior
    help:
      zh_Hans: 上下文长度超出行为
      en_US: Context Length Exceeded Behavior
    type: string
    options:
      - None
      - truncate
      - error
  - name: response_format
    use_template: response_format
pricing:
  input: '0.35'
  output: '0.4'
  unit: '0.000001'
  currency: USD
```

New file (30 lines):

```yaml
model: qwen/qwen-2.5-72b-instruct
label:
  en_US: qwen-2.5-72b-instruct
model_type: llm
features:
  - agent-thought
model_properties:
  mode: chat
  context_size: 131072
parameter_rules:
  - name: temperature
    use_template: temperature
  - name: max_tokens
    use_template: max_tokens
    type: int
    default: 512
    min: 1
    max: 8192
    help:
      zh_Hans: 指定生成结果长度的上限。如果生成结果截断,可以调大该参数。
      en_US: Specifies the upper limit on the length of generated results. If the generated results are truncated, you can increase this parameter.
  - name: top_p
    use_template: top_p
  - name: frequency_penalty
    use_template: frequency_penalty
pricing:
  input: "0.35"
  output: "0.4"
  unit: "0.000001"
  currency: USD
```
New file (4 lines):

```yaml
- rerank-2
- rerank-lite-2
- rerank-1
- rerank-lite-1
```

New file (4 lines):

```yaml
model: rerank-2
model_type: rerank
model_properties:
  context_size: 16000
```

New file (4 lines):

```yaml
model: rerank-lite-2
model_type: rerank
model_properties:
  context_size: 8000
```

New file (6 lines):

```yaml
- voyage-3
- voyage-3-lite
- voyage-finance-2
- voyage-multilingual-2
- voyage-law-2
- voyage-code-2
```

New file (8 lines):

```yaml
model: voyage-code-2
model_type: text-embedding
model_properties:
  context_size: 16000
pricing:
  input: '0.00012'
  unit: '0.001'
  currency: USD
```

New file (8 lines):

```yaml
model: voyage-finance-2
model_type: text-embedding
model_properties:
  context_size: 32000
pricing:
  input: '0.00012'
  unit: '0.001'
  currency: USD
```

New file (8 lines):

```yaml
model: voyage-law-2
model_type: text-embedding
model_properties:
  context_size: 16000
pricing:
  input: '0.00012'
  unit: '0.001'
  currency: USD
```

New file (8 lines):

```yaml
model: voyage-multilingual-2
model_type: text-embedding
model_properties:
  context_size: 32000
pricing:
  input: '0.00012'
  unit: '0.001'
  currency: USD
```
```diff
@@ -59,6 +59,7 @@ from core.model_runtime.model_providers.__base.large_language_model import Large
 from core.model_runtime.model_providers.xinference.xinference_helper import (
     XinferenceHelper,
     XinferenceModelExtraParameter,
+    validate_model_uid,
 )
 from core.model_runtime.utils import helper

@@ -114,7 +115,7 @@ class XinferenceAILargeLanguageModel(LargeLanguageModel):
         }
         """
         try:
-            if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
+            if not validate_model_uid(credentials):
                 raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")

             extra_param = XinferenceHelper.get_xinference_extra_parameter(
```
```diff
@@ -15,6 +15,7 @@ from core.model_runtime.errors.invoke import (
 )
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.rerank_model import RerankModel
+from core.model_runtime.model_providers.xinference.xinference_helper import validate_model_uid


 class XinferenceRerankModel(RerankModel):

@@ -77,10 +78,7 @@ class XinferenceRerankModel(RerankModel):
             )

             # score threshold check
-            if score_threshold is not None:
-                if result["relevance_score"] >= score_threshold:
-                    rerank_documents.append(rerank_document)
-            else:
+            if score_threshold is None or result["relevance_score"] >= score_threshold:
                 rerank_documents.append(rerank_document)

         return RerankResult(model=model, docs=rerank_documents)

@@ -94,7 +92,7 @@ class XinferenceRerankModel(RerankModel):
         :return:
         """
         try:
-            if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
+            if not validate_model_uid(credentials):
                 raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")

             credentials["server_url"] = credentials["server_url"].removesuffix("/")
```
```diff
@@ -14,6 +14,7 @@ from core.model_runtime.errors.invoke import (
 )
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.speech2text_model import Speech2TextModel
+from core.model_runtime.model_providers.xinference.xinference_helper import validate_model_uid


 class XinferenceSpeech2TextModel(Speech2TextModel):

@@ -42,7 +43,7 @@ class XinferenceSpeech2TextModel(Speech2TextModel):
         :return:
         """
         try:
-            if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
+            if not validate_model_uid(credentials):
                 raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")

             credentials["server_url"] = credentials["server_url"].removesuffix("/")
```
```diff
@@ -17,7 +17,7 @@ from core.model_runtime.errors.invoke import (
 )
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.text_embedding_model import TextEmbeddingModel
-from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper
+from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper, validate_model_uid


 class XinferenceTextEmbeddingModel(TextEmbeddingModel):

@@ -110,7 +110,7 @@ class XinferenceTextEmbeddingModel(TextEmbeddingModel):
         :return:
         """
         try:
-            if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
+            if not validate_model_uid(credentials):
                 raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")

             server_url = credentials["server_url"]
```
```diff
@@ -15,7 +15,7 @@ from core.model_runtime.errors.invoke import (
 )
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
-from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper
+from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper, validate_model_uid


 class XinferenceText2SpeechModel(TTSModel):

@@ -70,7 +70,7 @@ class XinferenceText2SpeechModel(TTSModel):
         :return:
         """
         try:
-            if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
+            if not validate_model_uid(credentials):
                 raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")

             credentials["server_url"] = credentials["server_url"].removesuffix("/")
```
```diff
@@ -132,3 +132,16 @@ class XinferenceHelper:
             context_length=context_length,
             model_family=model_family,
         )
+
+
+def validate_model_uid(credentials: dict) -> bool:
+    """
+    Validate the model_uid within the credentials dictionary to ensure it does not
+    contain forbidden characters ("/", "?", "#").
+
+    :param credentials: model credentials
+    :return: True if the model_uid does not contain forbidden characters ("/", "?", "#"), else False.
+    """
+    forbidden_characters = ["/", "?", "#"]
+    model_uid = credentials.get("model_uid", "")
+    return not any(char in forbidden_characters for char in model_uid)
```
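Illustrative calls for the new helper (behavior as defined above):

```python
assert validate_model_uid({"model_uid": "chatglm3"}) is True
assert validate_model_uid({"model_uid": "models/chatglm3"}) is False  # contains "/"
assert validate_model_uid({"model_uid": "uid#1"}) is False            # contains "#"
assert validate_model_uid({}) is True  # a missing model_uid defaults to "" and passes
```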
```diff
@@ -18,8 +18,12 @@ class KeywordsModeration(Moderation):
         if not config.get("keywords"):
             raise ValueError("keywords is required")

-        if len(config.get("keywords")) > 1000:
-            raise ValueError("keywords length must be less than 1000")
+        if len(config.get("keywords")) > 10000:
+            raise ValueError("keywords length must be less than 10000")
+
+        keywords_row_len = config["keywords"].split("\n")
+        if len(keywords_row_len) > 100:
+            raise ValueError("the number of rows for the keywords must be less than 100")

     def moderation_for_inputs(self, inputs: dict, query: str = "") -> ModerationInputsResult:
         flagged = False
```
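The validation now enforces two independent limits: total length (raised from 1000 to 10000 characters) and, newly, a cap of 100 newline-separated rows. A quick sketch of a config that passes both checks:

```python
config = {"keywords": "spam\nscam\nphishing"}

assert len(config["keywords"]) <= 10000            # total-length limit
assert len(config["keywords"].split("\n")) <= 100  # row-count limit
```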
```diff
@@ -159,6 +159,16 @@ class LangFuseDataTrace(BaseTraceInstance):
                 "status": status,
             }
         )
+        process_data = json.loads(node_execution.process_data) if node_execution.process_data else {}
+        model_provider = process_data.get("model_provider", None)
+        model_name = process_data.get("model_name", None)
+        if model_provider is not None and model_name is not None:
+            metadata.update(
+                {
+                    "model_provider": model_provider,
+                    "model_name": model_name,
+                }
+            )

         # add span
         if trace_info.message_id:

@@ -191,7 +201,6 @@ class LangFuseDataTrace(BaseTraceInstance):

             self.add_span(langfuse_span_data=span_data)

-        process_data = json.loads(node_execution.process_data) if node_execution.process_data else {}
         if process_data and process_data.get("model_mode") == "chat":
             total_token = metadata.get("total_tokens", 0)
             # add generation
```
```diff
@@ -10,6 +10,7 @@ from core.rag.rerank.constants.rerank_mode import RerankMode
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 from extensions.ext_database import db
 from models.dataset import Dataset
+from services.external_knowledge_service import ExternalDatasetService

 default_retrieval_model = {
     "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,

@@ -34,6 +35,9 @@ class RetrievalService:
         weights: Optional[dict] = None,
     ):
         dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
+        if not dataset:
+            return []
+
         if not dataset or dataset.available_document_count == 0 or dataset.available_segment_count == 0:
             return []
         all_documents = []

@@ -108,6 +112,16 @@ class RetrievalService:
             )
         return all_documents

+    @classmethod
+    def external_retrieve(cls, dataset_id: str, query: str, external_retrieval_model: Optional[dict] = None):
+        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
+        if not dataset:
+            return []
+        all_documents = ExternalDatasetService.fetch_external_knowledge_retrieval(
+            dataset.tenant_id, dataset_id, query, external_retrieval_model
+        )
+        return all_documents
+
     @classmethod
     def keyword_search(
         cls, flask_app: Flask, dataset_id: str, query: str, top_k: int, all_documents: list, exceptions: list
```
```diff
@@ -166,7 +166,7 @@ class PGVector(BaseVector):

         with self._get_cursor() as cur:
             cur.execute(
-                f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), to_tsquery(%s)) AS score
+                f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score
                 FROM {self.table_name}
                 WHERE to_tsvector(text) @@ plainto_tsquery(%s)
                 ORDER BY score DESC
```
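The scoring expression previously mixed `to_tsquery` (for ranking) with `plainto_tsquery` (for matching). `to_tsquery` parses its input as a tsquery expression and raises a syntax error on free-form text, while `plainto_tsquery` sanitizes plain text and ANDs the lexemes, so the change makes ranking tolerate raw user queries. A hedged sketch of the difference (assumes an open psycopg2 connection named `conn`):

```python
# Illustrative only; `conn` is assumed to be an existing psycopg2 connection.
with conn.cursor() as cur:
    cur.execute("SELECT plainto_tsquery(%s)", ("hello world",))
    print(cur.fetchone()[0])  # 'hello' & 'world'

    # The same free-form input fed to to_tsquery raises a syntax error,
    # because to_tsquery expects operators between lexemes:
    # cur.execute("SELECT to_tsquery(%s)", ("hello world",))  # -> syntax error in tsquery
```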
**api/core/rag/entities/context_entities.py** (new file, 10 lines)
```python
from pydantic import BaseModel


class DocumentContext(BaseModel):
    """
    Model class for document context.
    """

    content: str
    score: float
```
```diff
@@ -12,6 +12,7 @@ from core.rag.extractor.entity.extract_setting import ExtractSetting
 from core.rag.extractor.excel_extractor import ExcelExtractor
 from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor
 from core.rag.extractor.html_extractor import HtmlExtractor
+from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor
 from core.rag.extractor.markdown_extractor import MarkdownExtractor
 from core.rag.extractor.notion_extractor import NotionExtractor
 from core.rag.extractor.pdf_extractor import PdfExtractor

@@ -171,6 +172,15 @@ class ExtractProcessor:
                     only_main_content=extract_setting.website_info.only_main_content,
                 )
                 return extractor.extract()
+            elif extract_setting.website_info.provider == "jinareader":
+                extractor = JinaReaderWebExtractor(
+                    url=extract_setting.website_info.url,
+                    job_id=extract_setting.website_info.job_id,
+                    tenant_id=extract_setting.website_info.tenant_id,
+                    mode=extract_setting.website_info.mode,
+                    only_main_content=extract_setting.website_info.only_main_content,
+                )
+                return extractor.extract()
             else:
                 raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}")
         else:
```
**api/core/rag/extractor/jina_reader_extractor.py** (new file, 35 lines)
```python
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService


class JinaReaderWebExtractor(BaseExtractor):
    """
    Crawl and scrape websites and return content in clean llm-ready markdown.
    """

    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
        """Initialize with url, api_key, base_url and mode."""
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """Extract content from the URL."""
        documents = []
        if self.mode == "crawl":
            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
            if crawl_data is None:
                return []
            document = Document(
                page_content=crawl_data.get("content", ""),
                metadata={
                    "source_url": crawl_data.get("url"),
                    "description": crawl_data.get("description"),
                    "title": crawl_data.get("title"),
                },
            )
            documents.append(document)
        return documents
```
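A hedged usage sketch for the new extractor (the identifiers below are placeholders; `extract()` returns an empty list when no crawl data is cached for the job):

```python
extractor = JinaReaderWebExtractor(
    url="https://example.com",  # placeholder URL
    job_id="job-123",           # placeholder crawl job id
    tenant_id="tenant-abc",     # placeholder tenant id
    mode="crawl",
)
documents = extractor.extract()
for doc in documents:
    print(doc.metadata.get("title"), len(doc.page_content))
```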
```diff
@@ -17,6 +17,8 @@ class Document(BaseModel):
     """
     metadata: Optional[dict] = Field(default_factory=dict)

+    provider: Optional[str] = "dify"
+

 class BaseDocumentTransformer(ABC):
     """Abstract base class for document transformation systems.
```
```diff
@@ -28,11 +28,16 @@ class RerankModelRunner:
         docs = []
         doc_id = []
         unique_documents = []
-        for document in documents:
+        dify_documents = [item for item in documents if item.provider == "dify"]
+        external_documents = [item for item in documents if item.provider == "external"]
+        for document in dify_documents:
             if document.metadata["doc_id"] not in doc_id:
                 doc_id.append(document.metadata["doc_id"])
                 docs.append(document.page_content)
                 unique_documents.append(document)
+        for document in external_documents:
+            docs.append(document.page_content)
+            unique_documents.append(document)

         documents = unique_documents

@@ -46,14 +51,10 @@ class RerankModelRunner:
             # format document
             rerank_document = Document(
                 page_content=result.text,
-                metadata={
-                    "doc_id": documents[result.index].metadata["doc_id"],
-                    "doc_hash": documents[result.index].metadata["doc_hash"],
-                    "document_id": documents[result.index].metadata["document_id"],
-                    "dataset_id": documents[result.index].metadata["dataset_id"],
-                    "score": result.score,
-                },
+                metadata=documents[result.index].metadata,
+                provider=documents[result.index].provider,
             )
+            rerank_document.metadata["score"] = result.score
             rerank_documents.append(rerank_document)

         return rerank_documents
```
@@ -20,6 +20,7 @@ from core.ops.utils import measure_time
 from core.rag.data_post_processor.data_post_processor import DataPostProcessor
 from core.rag.datasource.keyword.jieba.jieba_keyword_table_handler import JiebaKeywordTableHandler
 from core.rag.datasource.retrieval_service import RetrievalService
+from core.rag.entities.context_entities import DocumentContext
 from core.rag.models.document import Document
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 from core.rag.retrieval.router.multi_dataset_function_call_router import FunctionCallMultiDatasetRouter
@@ -30,6 +31,7 @@ from core.tools.tool.dataset_retriever.dataset_retriever_tool import DatasetRetr
 from extensions.ext_database import db
 from models.dataset import Dataset, DatasetQuery, DocumentSegment
 from models.dataset import Document as DatasetDocument
+from services.external_knowledge_service import ExternalDatasetService
 
 default_retrieval_model = {
     "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
@@ -110,7 +112,7 @@ class DatasetRetrieval:
                 continue
 
             # pass if dataset is not available
-            if dataset and dataset.available_document_count == 0:
+            if dataset and dataset.available_document_count == 0 and dataset.provider != "external":
                 continue
 
             available_datasets.append(dataset)
@@ -146,69 +148,93 @@ class DatasetRetrieval:
                 message_id,
             )
 
-            document_score_list = {}
-            for item in all_documents:
-                if item.metadata.get("score"):
-                    document_score_list[item.metadata["doc_id"]] = item.metadata["score"]
-
+            dify_documents = [item for item in all_documents if item.provider == "dify"]
+            external_documents = [item for item in all_documents if item.provider == "external"]
             document_context_list = []
-            index_node_ids = [document.metadata["doc_id"] for document in all_documents]
-            segments = DocumentSegment.query.filter(
-                DocumentSegment.dataset_id.in_(dataset_ids),
-                DocumentSegment.completed_at.isnot(None),
-                DocumentSegment.status == "completed",
-                DocumentSegment.enabled == True,
-                DocumentSegment.index_node_id.in_(index_node_ids),
-            ).all()
+            retrieval_resource_list = []
+            # deal with external documents
+            for item in external_documents:
+                document_context_list.append(DocumentContext(content=item.page_content, score=item.metadata.get("score")))
+                source = {
+                    "dataset_id": item.metadata.get("dataset_id"),
+                    "dataset_name": item.metadata.get("dataset_name"),
+                    "document_name": item.metadata.get("title"),
+                    "data_source_type": "external",
+                    "retriever_from": invoke_from.to_source(),
+                    "score": item.metadata.get("score"),
+                    "content": item.page_content,
+                }
+                retrieval_resource_list.append(source)
+            document_score_list = {}
+            # deal with dify documents
+            if dify_documents:
+                for item in dify_documents:
+                    if item.metadata.get("score"):
+                        document_score_list[item.metadata["doc_id"]] = item.metadata["score"]
 
-            if segments:
-                index_node_id_to_position = {id: position for position, id in enumerate(index_node_ids)}
-                sorted_segments = sorted(
-                    segments, key=lambda segment: index_node_id_to_position.get(segment.index_node_id, float("inf"))
-                )
-                for segment in sorted_segments:
-                    if segment.answer:
-                        document_context_list.append(f"question:{segment.get_sign_content()} answer:{segment.answer}")
-                    else:
-                        document_context_list.append(segment.get_sign_content())
-                if show_retrieve_source:
-                    context_list = []
-                    resource_number = 1
+                index_node_ids = [document.metadata["doc_id"] for document in dify_documents]
+                segments = DocumentSegment.query.filter(
+                    DocumentSegment.dataset_id.in_(dataset_ids),
+                    DocumentSegment.status == "completed",
+                    DocumentSegment.enabled == True,
+                    DocumentSegment.index_node_id.in_(index_node_ids),
+                ).all()
+
+                if segments:
+                    index_node_id_to_position = {id: position for position, id in enumerate(index_node_ids)}
+                    sorted_segments = sorted(
+                        segments, key=lambda segment: index_node_id_to_position.get(segment.index_node_id, float("inf"))
+                    )
                     for segment in sorted_segments:
-                        dataset = Dataset.query.filter_by(id=segment.dataset_id).first()
-                        document = DatasetDocument.query.filter(
-                            DatasetDocument.id == segment.document_id,
-                            DatasetDocument.enabled == True,
-                            DatasetDocument.archived == False,
-                        ).first()
-                        if dataset and document:
-                            source = {
-                                "position": resource_number,
-                                "dataset_id": dataset.id,
-                                "dataset_name": dataset.name,
-                                "document_id": document.id,
-                                "document_name": document.name,
-                                "data_source_type": document.data_source_type,
-                                "segment_id": segment.id,
-                                "retriever_from": invoke_from.to_source(),
-                                "score": document_score_list.get(segment.index_node_id, None),
-                            }
+                        if segment.answer:
+                            document_context_list.append(
+                                DocumentContext(
+                                    content=f"question:{segment.get_sign_content()} answer:{segment.answer}",
+                                    score=document_score_list.get(segment.index_node_id, None),
+                                )
+                            )
+                        else:
+                            document_context_list.append(
+                                DocumentContext(
+                                    content=segment.get_sign_content(),
+                                    score=document_score_list.get(segment.index_node_id, None),
+                                )
+                            )
+                    if show_retrieve_source:
+                        for segment in sorted_segments:
+                            dataset = Dataset.query.filter_by(id=segment.dataset_id).first()
+                            document = DatasetDocument.query.filter(
+                                DatasetDocument.id == segment.document_id,
+                                DatasetDocument.enabled == True,
+                                DatasetDocument.archived == False,
+                            ).first()
+                            if dataset and document:
+                                source = {
+                                    "dataset_id": dataset.id,
+                                    "dataset_name": dataset.name,
+                                    "document_id": document.id,
+                                    "document_name": document.name,
+                                    "data_source_type": document.data_source_type,
+                                    "segment_id": segment.id,
+                                    "retriever_from": invoke_from.to_source(),
+                                    "score": document_score_list.get(segment.index_node_id, None),
+                                }
 
-                            if invoke_from.to_source() == "dev":
-                                source["hit_count"] = segment.hit_count
-                                source["word_count"] = segment.word_count
-                                source["segment_position"] = segment.position
-                                source["index_node_hash"] = segment.index_node_hash
-                            if segment.answer:
-                                source["content"] = f"question:{segment.content} \nanswer:{segment.answer}"
-                            else:
-                                source["content"] = segment.content
-                            context_list.append(source)
-                            resource_number += 1
-                    if hit_callback:
-                        hit_callback.return_retriever_resource_info(context_list)
-
-            return str("\n".join(document_context_list))
+                                if invoke_from.to_source() == "dev":
+                                    source["hit_count"] = segment.hit_count
+                                    source["word_count"] = segment.word_count
+                                    source["segment_position"] = segment.position
+                                    source["index_node_hash"] = segment.index_node_hash
+                                if segment.answer:
+                                    source["content"] = f"question:{segment.content} \nanswer:{segment.answer}"
+                                else:
+                                    source["content"] = segment.content
+                                retrieval_resource_list.append(source)
+            if hit_callback and retrieval_resource_list:
+                hit_callback.return_retriever_resource_info(retrieval_resource_list)
+            if document_context_list:
+                document_context_list = sorted(document_context_list, key=lambda x: x.score, reverse=True)
+                return str("\n".join([document_context.content for document_context in document_context_list]))
+            return ""
 
     def single_retrieve(
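Note the behavioral shift at the end of this hunk: contexts are now returned ordered by score rather than by segment position, and an empty retrieval yields "" instead of joining an empty list. A compact sketch of that final assembly, assuming DocumentContext exposes content and score as used above:

    contexts = [
        DocumentContext(content="external chunk", score=0.91),
        DocumentContext(content="dify segment", score=0.78),
    ]
    contexts = sorted(contexts, key=lambda x: x.score, reverse=True)
    prompt_context = "\n".join(c.content for c in contexts)   # highest-scoring first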
@@ -256,36 +282,58 @@ class DatasetRetrieval:
         # get retrieval model config
         dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
         if dataset:
-            retrieval_model_config = dataset.retrieval_model or default_retrieval_model
-
-            # get top k
-            top_k = retrieval_model_config["top_k"]
-            # get retrieval method
-            if dataset.indexing_technique == "economy":
-                retrieval_method = "keyword_search"
-            else:
-                retrieval_method = retrieval_model_config["search_method"]
-            # get reranking model
-            reranking_model = (
-                retrieval_model_config["reranking_model"] if retrieval_model_config["reranking_enable"] else None
-            )
-            # get score threshold
-            score_threshold = 0.0
-            score_threshold_enabled = retrieval_model_config.get("score_threshold_enabled")
-            if score_threshold_enabled:
-                score_threshold = retrieval_model_config.get("score_threshold")
-
-            with measure_time() as timer:
-                results = RetrievalService.retrieve(
-                    retrieval_method=retrieval_method,
-                    dataset_id=dataset.id,
-                    query=query,
-                    top_k=top_k,
-                    score_threshold=score_threshold,
-                    reranking_model=reranking_model,
-                    reranking_mode=retrieval_model_config.get("reranking_mode", "reranking_model"),
-                    weights=retrieval_model_config.get("weights", None),
-                )
+            results = []
+            if dataset.provider == "external":
+                external_documents = ExternalDatasetService.fetch_external_knowledge_retrieval(
+                    tenant_id=dataset.tenant_id,
+                    dataset_id=dataset_id,
+                    query=query,
+                    external_retrieval_parameters=dataset.retrieval_model,
+                )
+                for external_document in external_documents:
+                    document = Document(
+                        page_content=external_document.get("content"),
+                        metadata=external_document.get("metadata"),
+                        provider="external",
+                    )
+                    document.metadata["score"] = external_document.get("score")
+                    document.metadata["title"] = external_document.get("title")
+                    document.metadata["dataset_id"] = dataset_id
+                    document.metadata["dataset_name"] = dataset.name
+                    results.append(document)
+            else:
+                retrieval_model_config = dataset.retrieval_model or default_retrieval_model
+
+                # get top k
+                top_k = retrieval_model_config["top_k"]
+                # get retrieval method
+                if dataset.indexing_technique == "economy":
+                    retrieval_method = "keyword_search"
+                else:
+                    retrieval_method = retrieval_model_config["search_method"]
+                # get reranking model
+                reranking_model = (
+                    retrieval_model_config["reranking_model"]
+                    if retrieval_model_config["reranking_enable"]
+                    else None
+                )
+                # get score threshold
+                score_threshold = 0.0
+                score_threshold_enabled = retrieval_model_config.get("score_threshold_enabled")
+                if score_threshold_enabled:
+                    score_threshold = retrieval_model_config.get("score_threshold")
+
+                with measure_time() as timer:
+                    results = RetrievalService.retrieve(
+                        retrieval_method=retrieval_method,
+                        dataset_id=dataset.id,
+                        query=query,
+                        top_k=top_k,
+                        score_threshold=score_threshold,
+                        reranking_model=reranking_model,
+                        reranking_mode=retrieval_model_config.get("reranking_mode", "reranking_model"),
+                        weights=retrieval_model_config.get("weights", None),
+                    )
             self._on_query(query, [dataset_id], app_id, user_from, user_id)
 
             if results:
@@ -356,7 +404,8 @@ class DatasetRetrieval:
         self, documents: list[Document], message_id: Optional[str] = None, timer: Optional[dict] = None
     ) -> None:
         """Handle retrieval end."""
-        for document in documents:
+        dify_documents = [document for document in documents if document.provider == "dify"]
+        for document in dify_documents:
             query = db.session.query(DocumentSegment).filter(
                 DocumentSegment.index_node_id == document.metadata["doc_id"]
             )
@@ -409,35 +458,54 @@ class DatasetRetrieval:
         if not dataset:
             return []
 
-        # get retrieval model , if the model is not setting , using default
-        retrieval_model = dataset.retrieval_model or default_retrieval_model
-
-        if dataset.indexing_technique == "economy":
-            # use keyword table query
-            documents = RetrievalService.retrieve(
-                retrieval_method="keyword_search", dataset_id=dataset.id, query=query, top_k=top_k
-            )
-            if documents:
-                all_documents.extend(documents)
-        else:
-            if top_k > 0:
-                # retrieval source
-                documents = RetrievalService.retrieve(
-                    retrieval_method=retrieval_model["search_method"],
-                    dataset_id=dataset.id,
-                    query=query,
-                    top_k=retrieval_model.get("top_k") or 2,
-                    score_threshold=retrieval_model.get("score_threshold", 0.0)
-                    if retrieval_model["score_threshold_enabled"]
-                    else 0.0,
-                    reranking_model=retrieval_model.get("reranking_model", None)
-                    if retrieval_model["reranking_enable"]
-                    else None,
-                    reranking_mode=retrieval_model.get("reranking_mode") or "reranking_model",
-                    weights=retrieval_model.get("weights", None),
-                )
-
-                all_documents.extend(documents)
+        if dataset.provider == "external":
+            external_documents = ExternalDatasetService.fetch_external_knowledge_retrieval(
+                tenant_id=dataset.tenant_id,
+                dataset_id=dataset_id,
+                query=query,
+                external_retrieval_parameters=dataset.retrieval_model,
+            )
+            for external_document in external_documents:
+                document = Document(
+                    page_content=external_document.get("content"),
+                    metadata=external_document.get("metadata"),
+                    provider="external",
+                )
+                document.metadata["score"] = external_document.get("score")
+                document.metadata["title"] = external_document.get("title")
+                document.metadata["dataset_id"] = dataset_id
+                document.metadata["dataset_name"] = dataset.name
+                all_documents.append(document)
+        else:
+            # get retrieval model , if the model is not setting , using default
+            retrieval_model = dataset.retrieval_model or default_retrieval_model
+
+            if dataset.indexing_technique == "economy":
+                # use keyword table query
+                documents = RetrievalService.retrieve(
+                    retrieval_method="keyword_search", dataset_id=dataset.id, query=query, top_k=top_k
+                )
+                if documents:
+                    all_documents.extend(documents)
+            else:
+                if top_k > 0:
+                    # retrieval source
+                    documents = RetrievalService.retrieve(
+                        retrieval_method=retrieval_model["search_method"],
+                        dataset_id=dataset.id,
+                        query=query,
+                        top_k=retrieval_model.get("top_k") or 2,
+                        score_threshold=retrieval_model.get("score_threshold", 0.0)
+                        if retrieval_model["score_threshold_enabled"]
+                        else 0.0,
+                        reranking_model=retrieval_model.get("reranking_model", None)
+                        if retrieval_model["reranking_enable"]
+                        else None,
+                        reranking_mode=retrieval_model.get("reranking_mode") or "reranking_model",
+                        weights=retrieval_model.get("weights", None),
+                    )
 
+                    all_documents.extend(documents)
 
     def to_dataset_retriever_tool(
         self,
api/core/tools/provider/builtin/discord/_assets/icon.svg (new file, 7 lines; rendered 80x80, 2.2 KiB)
@@ -0,0 +1,7 @@
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+
+<!-- Uploaded to: SVG Repo, www.svgrepo.com, Transformed by: SVG Repo Mixer Tools -->
+<svg width="80px" height="80px" viewBox="0 -28.5 256 256" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" preserveAspectRatio="xMidYMid" fill="#000000">
+
+<g id="SVGRepo_bgCarrier" stroke-width="0"/>
(remaining path data of the icon is not shown in this view)
api/core/tools/provider/builtin/discord/discord.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+from typing import Any
+
+from core.tools.provider.builtin.discord.tools.discord_webhook import DiscordWebhookTool
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class DiscordProvider(BuiltinToolProviderController):
+    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
+        DiscordWebhookTool()
api/core/tools/provider/builtin/discord/discord.yaml (new file, 16 lines)
@@ -0,0 +1,16 @@
+identity:
+  author: Ice Yao
+  name: discord
+  label:
+    en_US: Discord
+    zh_Hans: Discord
+    pt_BR: Discord
+  description:
+    en_US: Discord Webhook
+    zh_Hans: Discord Webhook
+    pt_BR: Discord Webhook
+  icon: icon.svg
+  tags:
+    - social
+    - productivity
+credentials_for_provider:
@@ -0,0 +1,49 @@
+from typing import Any, Union
+
+import httpx
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class DiscordWebhookTool(BuiltinTool):
+    def _invoke(
+        self, user_id: str, tool_parameters: dict[str, Any]
+    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+        """
+        Incoming Webhooks
+        API Document:
+            https://discord.com/developers/docs/resources/webhook#execute-webhook
+        """
+
+        content = tool_parameters.get("content", "")
+        if not content:
+            return self.create_text_message("Invalid parameter content")
+
+        webhook_url = tool_parameters.get("webhook_url", "")
+
+        if not webhook_url.startswith("https://discord.com/api/webhooks/"):
+            return self.create_text_message(
+                f"Invalid parameter webhook_url ${webhook_url}, \
+                not a valid Discord webhook URL"
+            )
+
+        headers = {
+            "Content-Type": "application/json",
+        }
+        params = {}
+        payload = {
+            "content": content,
+        }
+
+        try:
+            res = httpx.post(webhook_url, headers=headers, params=params, json=payload)
+            if res.is_success:
+                return self.create_text_message("Text message was sent successfully")
+            else:
+                return self.create_text_message(
+                    f"Failed to send the text message, \
+                    status code: {res.status_code}, response: {res.text}"
+                )
+        except Exception as e:
+            return self.create_text_message("Failed to send message through webhook. {}".format(e))
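Outside of the tool wrapper, the same webhook call can be exercised directly with httpx; a minimal sketch (the webhook ID and token in the URL are placeholders):

    import httpx

    webhook_url = "https://discord.com/api/webhooks/<id>/<token>"   # placeholder
    res = httpx.post(webhook_url, json={"content": "Hello from Dify!"})
    print(res.status_code)   # Discord replies 204 No Content on success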
@@ -0,0 +1,40 @@
+identity:
+  name: discord_webhook
+  author: Ice Yao
+  label:
+    en_US: Incoming Webhook to send message
+    zh_Hans: 通过入站Webhook发送消息
+    pt_BR: Incoming Webhook to send message
+  icon: icon.svg
+description:
+  human:
+    en_US: Sending a message on Discord via the Incoming Webhook
+    zh_Hans: 通过入站Webhook在Discord上发送消息
+    pt_BR: Sending a message on Discord via the Incoming Webhook
+  llm: A tool for sending messages to a chat on Discord.
+parameters:
+  - name: webhook_url
+    type: string
+    required: true
+    label:
+      en_US: Discord Incoming Webhook url
+      zh_Hans: Discord入站Webhook的url
+      pt_BR: Discord Incoming Webhook url
+    human_description:
+      en_US: Discord Incoming Webhook url
+      zh_Hans: Discord入站Webhook的url
+      pt_BR: Discord Incoming Webhook url
+    form: form
+  - name: content
+    type: string
+    required: true
+    label:
+      en_US: content
+      zh_Hans: 消息内容
+      pt_BR: content
+    human_description:
+      en_US: Content to sent to the channel or person.
+      zh_Hans: 消息内容文本
+      pt_BR: Content to sent to the channel or person.
+    llm_description: Content of the message
+    form: llm
@@ -32,16 +32,17 @@ class StepfunTool(BuiltinTool):
         prompt = tool_parameters.get("prompt", "")
         if not prompt:
             return self.create_text_message("Please input prompt")
-
+        if len(prompt) > 1024:
+            return self.create_text_message("The prompt length should less than 1024")
         seed = tool_parameters.get("seed", 0)
         if seed > 0:
             extra_body["seed"] = seed
-        steps = tool_parameters.get("steps", 0)
+        steps = tool_parameters.get("steps", 50)
         if steps > 0:
             extra_body["steps"] = steps
-        negative_prompt = tool_parameters.get("negative_prompt", "")
-        if negative_prompt:
-            extra_body["negative_prompt"] = negative_prompt
+        cfg_scale = tool_parameters.get("cfg_scale", 7.5)
+        if cfg_scale > 0:
+            extra_body["cfg_scale"] = cfg_scale
 
         # call openapi stepfun model
         response = client.images.generate(
@@ -51,7 +52,6 @@ class StepfunTool(BuiltinTool):
             n=tool_parameters.get("n", 1),
             extra_body=extra_body,
         )
-        print(response)
 
         result = []
         for image in response.data:
@@ -33,9 +33,9 @@ parameters:
     type: select
     required: false
     human_description:
-      en_US: used for selecting the image size
-      zh_Hans: 用于选择图像大小
-      pt_BR: used for selecting the image size
+      en_US: The size of the generated image
+      zh_Hans: 生成的图片大小
+      pt_BR: The size of the generated image
     label:
       en_US: Image size
       zh_Hans: 图像大小
@@ -77,17 +77,17 @@ parameters:
     type: number
     required: true
     human_description:
-      en_US: used for selecting the number of images
-      zh_Hans: 用于选择图像数量
-      pt_BR: used for selecting the number of images
+      en_US: Number of generated images, now only one image can be generated at a time
+      zh_Hans: 生成的图像数量,当前仅支持每次生成一张图片
+      pt_BR: Number of generated images, now only one image can be generated at a time
     label:
-      en_US: Number of images
-      zh_Hans: 图像数量
-      pt_BR: Number of images
+      en_US: Number of generated images
+      zh_Hans: 生成的图像数量
+      pt_BR: Number of generated images
     form: form
     default: 1
     min: 1
-    max: 10
+    max: 1
  - name: seed
    type: number
    required: false
@@ -109,21 +109,25 @@ parameters:
       zh_Hans: Steps
       pt_BR: Steps
     human_description:
-      en_US: Steps
-      zh_Hans: Steps
-      pt_BR: Steps
+      en_US: Steps, now support integers between 1 and 100
+      zh_Hans: Steps, 当前支持 1~100 之间整数
+      pt_BR: Steps, now support integers between 1 and 100
     form: form
-    default: 10
-  - name: negative_prompt
-    type: string
+    default: 50
+    min: 1
+    max: 100
+  - name: cfg_scale
+    type: number
     required: false
     label:
-      en_US: Negative prompt
-      zh_Hans: Negative prompt
-      pt_BR: Negative prompt
+      en_US: classifier-free guidance scale
+      zh_Hans: classifier-free guidance scale
+      pt_BR: classifier-free guidance scale
     human_description:
-      en_US: Negative prompt
-      zh_Hans: Negative prompt
-      pt_BR: Negative prompt
+      en_US: classifier-free guidance scale
+      zh_Hans: classifier-free guidance scale
+      pt_BR: classifier-free guidance scale
     form: form
-    default: (worst quality:1.3), (nsfw), low quality
+    default: 7.5
+    min: 1
+    max: 10
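Taken together, the tool now forwards seed, steps, and cfg_scale through extra_body, with negative_prompt dropped. A sketch of how the payload accumulates (values illustrative, client construction omitted):

    extra_body = {}
    params = {"prompt": "a red fox, watercolor", "seed": 42, "steps": 50, "cfg_scale": 7.5}
    if params["seed"] > 0:
        extra_body["seed"] = params["seed"]
    if params["steps"] > 0:
        extra_body["steps"] = params["steps"]
    if params["cfg_scale"] > 0:
        extra_body["cfg_scale"] = params["cfg_scale"]
    # extra_body == {"seed": 42, "steps": 50, "cfg_scale": 7.5}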
@@ -68,10 +68,13 @@ class WorkflowTool(Tool):
 
         result = []
 
-        outputs = data.get("outputs", {})
-        outputs, files = self._extract_files(outputs)
-        for file in files:
-            result.append(self.create_file_var_message(file))
+        outputs = data.get("outputs")
+        if outputs == None:
+            outputs = {}
+        else:
+            outputs, files = self._extract_files(outputs)
+            for file in files:
+                result.append(self.create_file_var_message(file))
 
         result.append(self.create_text_message(json.dumps(outputs, ensure_ascii=False)))
         result.append(self.create_json_message(outputs))
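The guard matters because dict.get only falls back to its default when the key is absent; a workflow that finishes with an explicit "outputs": None would previously have handed None to _extract_files. Illustration:

    data = {"outputs": None}
    data.get("outputs", {})    # -> None: the key exists, so the {} default is NOT used
    outputs = data.get("outputs")
    if outputs is None:        # the diff spells this `== None`
        outputs = {}           # downstream json.dumps now sees an empty dict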
@@ -156,16 +156,34 @@ class KnowledgeRetrievalNode(BaseNode):
                 weights,
                 node_data.multiple_retrieval_config.reranking_enable,
             )
 
         context_list = []
         if all_documents:
+            dify_documents = [item for item in all_documents if item.provider == "dify"]
+            external_documents = [item for item in all_documents if item.provider == "external"]
+            retrieval_resource_list = []
+            # deal with external documents
+            for item in external_documents:
+                source = {
+                    "metadata": {
+                        "_source": "knowledge",
+                        "dataset_id": item.metadata.get("dataset_id"),
+                        "dataset_name": item.metadata.get("dataset_name"),
+                        "document_name": item.metadata.get("title"),
+                        "data_source_type": "external",
+                        "retriever_from": "workflow",
+                        "score": item.metadata.get("score"),
+                    },
+                    "title": item.metadata.get("title"),
+                    "content": item.page_content,
+                }
+                retrieval_resource_list.append(source)
+            document_score_list = {}
+            # deal with dify documents
+            if dify_documents:
-            document_score_list = {}
-            page_number_list = {}
-            for item in all_documents:
+                for item in dify_documents:
                 if item.metadata.get("score"):
                     document_score_list[item.metadata["doc_id"]] = item.metadata["score"]
 
-            index_node_ids = [document.metadata["doc_id"] for document in all_documents]
+                index_node_ids = [document.metadata["doc_id"] for document in dify_documents]
             segments = DocumentSegment.query.filter(
                 DocumentSegment.dataset_id.in_(dataset_ids),
                 DocumentSegment.completed_at.isnot(None),
@@ -186,13 +204,10 @@ class KnowledgeRetrievalNode(BaseNode):
                     Document.enabled == True,
                     Document.archived == False,
                 ).first()
-
-                resource_number = 1
                 if dataset and document:
                     source = {
                         "metadata": {
                             "_source": "knowledge",
-                            "position": resource_number,
                             "dataset_id": dataset.id,
                             "dataset_name": dataset.name,
                             "document_id": document.id,
@@ -212,9 +227,16 @@ class KnowledgeRetrievalNode(BaseNode):
                         source["content"] = f"question:{segment.get_sign_content()} \nanswer:{segment.answer}"
                     else:
                         source["content"] = segment.get_sign_content()
-                    context_list.append(source)
-                    resource_number += 1
-        return context_list
+                    retrieval_resource_list.append(source)
+        if retrieval_resource_list:
+            retrieval_resource_list = sorted(
+                retrieval_resource_list, key=lambda x: x.get("metadata").get("score"), reverse=True
+            )
+            position = 1
+            for item in retrieval_resource_list:
+                item["metadata"]["position"] = position
+                position += 1
+        return retrieval_resource_list
 
     @classmethod
     def _extract_variable_selector_to_variable_mapping(
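Positions are now assigned after sorting by score, so position 1 is always the best-scoring source regardless of which dataset produced it. The same pattern in isolation:

    resources = [{"metadata": {"score": 0.4}}, {"metadata": {"score": 0.9}}]
    resources = sorted(resources, key=lambda x: x.get("metadata").get("score"), reverse=True)
    for position, item in enumerate(resources, start=1):
        item["metadata"]["position"] = position
    # resources[0]["metadata"] == {"score": 0.9, "position": 1}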
@@ -6,6 +6,7 @@ from flask import Flask
 
 from extensions.storage.aliyun_storage import AliyunStorage
 from extensions.storage.azure_storage import AzureStorage
+from extensions.storage.baidu_storage import BaiduStorage
 from extensions.storage.google_storage import GoogleStorage
 from extensions.storage.huawei_storage import HuaweiStorage
 from extensions.storage.local_storage import LocalStorage
@@ -35,6 +36,8 @@ class Storage:
             self.storage_runner = OCIStorage(app=app)
         elif storage_type == "huawei-obs":
             self.storage_runner = HuaweiStorage(app=app)
+        elif storage_type == "baidu-obs":
+            self.storage_runner = BaiduStorage(app=app)
         elif storage_type == "volcengine-tos":
             self.storage_runner = VolcengineStorage(app=app)
         else:
api/extensions/storage/baidu_storage.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+import base64
+import hashlib
+from collections.abc import Generator
+
+from baidubce.auth.bce_credentials import BceCredentials
+from baidubce.bce_client_configuration import BceClientConfiguration
+from baidubce.services.bos.bos_client import BosClient
+from flask import Flask
+
+from extensions.storage.base_storage import BaseStorage
+
+
+class BaiduStorage(BaseStorage):
+    """Implementation for baidu obs storage."""
+
+    def __init__(self, app: Flask):
+        super().__init__(app)
+        app_config = self.app.config
+        self.bucket_name = app_config.get("BAIDU_OBS_BUCKET_NAME")
+        client_config = BceClientConfiguration(
+            credentials=BceCredentials(
+                access_key_id=app_config.get("BAIDU_OBS_ACCESS_KEY"),
+                secret_access_key=app_config.get("BAIDU_OBS_SECRET_KEY"),
+            ),
+            endpoint=app_config.get("BAIDU_OBS_ENDPOINT"),
+        )
+
+        self.client = BosClient(config=client_config)
+
+    def save(self, filename, data):
+        md5 = hashlib.md5()
+        md5.update(data)
+        content_md5 = base64.standard_b64encode(md5.digest())
+        self.client.put_object(
+            bucket_name=self.bucket_name, key=filename, data=data, content_length=len(data), content_md5=content_md5
+        )
+
+    def load_once(self, filename: str) -> bytes:
+        response = self.client.get_object(bucket_name=self.bucket_name, key=filename)
+        return response.data.read()
+
+    def load_stream(self, filename: str) -> Generator:
+        def generate(filename: str = filename) -> Generator:
+            response = self.client.get_object(bucket_name=self.bucket_name, key=filename).data
+            while chunk := response.read(4096):
+                yield chunk
+
+        return generate()
+
+    def download(self, filename, target_filepath):
+        self.client.get_object_to_file(bucket_name=self.bucket_name, key=filename, file_name=target_filepath)
+
+    def exists(self, filename):
+        res = self.client.get_object_meta_data(bucket_name=self.bucket_name, key=filename)
+        if res is None:
+            return False
+        return True
+
+    def delete(self, filename):
+        self.client.delete_object(bucket_name=self.bucket_name, key=filename)
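A minimal sketch of exercising the new backend, assuming a Flask app whose config carries the BAIDU_OBS_* keys (illustrative only; it needs real Baidu BOS credentials):

    storage = BaiduStorage(app=flask_app)
    storage.save("uploads/hello.txt", b"hello dify")
    assert storage.exists("uploads/hello.txt")
    data = storage.load_once("uploads/hello.txt")           # -> b"hello dify"
    for chunk in storage.load_stream("uploads/hello.txt"):  # 4 KiB chunks
        pass
    storage.delete("uploads/hello.txt")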
@@ -29,7 +29,8 @@ class HuaweiStorage(BaseStorage):
     def load_stream(self, filename: str) -> Generator:
         def generate(filename: str = filename) -> Generator:
             response = self.client.getObject(bucketName=self.bucket_name, objectKey=filename)["body"].response
-            yield from response.read(4096)
+            while chunk := response.read(4096):
+                yield chunk
 
         return generate()
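This fixes a real streaming bug: `yield from response.read(4096)` reads a single 4 KiB bytes object and iterates over it, yielding one int per byte and never reading the rest of the object, while the walrus loop keeps reading until EOF and yields whole bytes chunks. The corrected pattern works against any file-like object:

    def iter_chunks(stream, size: int = 4096):
        # read until EOF, yielding bytes objects of up to `size` bytes each
        while chunk := stream.read(size):
            yield chunk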
@@ -38,9 +38,20 @@ dataset_retrieval_model_fields = {
     "score_threshold_enabled": fields.Boolean,
     "score_threshold": fields.Float,
 }
+external_retrieval_model_fields = {
+    "top_k": fields.Integer,
+    "score_threshold": fields.Float,
+}
 
 tag_fields = {"id": fields.String, "name": fields.String, "type": fields.String}
 
+external_knowledge_info_fields = {
+    "external_knowledge_id": fields.String,
+    "external_knowledge_api_id": fields.String,
+    "external_knowledge_api_name": fields.String,
+    "external_knowledge_api_endpoint": fields.String,
+}
+
 dataset_detail_fields = {
     "id": fields.String,
     "name": fields.String,
@@ -61,6 +72,8 @@ dataset_detail_fields = {
     "embedding_available": fields.Boolean,
     "retrieval_model_dict": fields.Nested(dataset_retrieval_model_fields),
     "tags": fields.List(fields.Nested(tag_fields)),
+    "external_knowledge_info": fields.Nested(external_knowledge_info_fields),
+    "external_retrieval_model": fields.Nested(external_retrieval_model_fields, allow_null=True),
 }
 
 dataset_query_detail_fields = {
api/fields/external_dataset_fields.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+from flask_restful import fields
+
+from libs.helper import TimestampField
+
+external_knowledge_api_query_detail_fields = {
+    "id": fields.String,
+    "name": fields.String,
+    "setting": fields.String,
+    "created_by": fields.String,
+    "created_at": TimestampField,
+}
@@ -4,25 +4,28 @@ from core.llm_generator.output_parser.errors import OutputParserError
 
 
 def parse_json_markdown(json_string: str) -> dict:
-    # Remove the triple backticks if present
+    # Get json from the backticks/braces
     json_string = json_string.strip()
-    start_index = json_string.find("```json")
-    end_index = json_string.find("```", start_index + len("```json"))
-
-    if start_index != -1 and end_index != -1:
-        extracted_content = json_string[start_index + len("```json") : end_index].strip()
-
-        # Parse the JSON string into a Python dictionary
-        parsed = json.loads(extracted_content)
-    elif start_index != -1 and end_index == -1 and json_string.endswith("``"):
-        end_index = json_string.find("``", start_index + len("```json"))
-        extracted_content = json_string[start_index + len("```json") : end_index].strip()
-
-        # Parse the JSON string into a Python dictionary
-        parsed = json.loads(extracted_content)
+    starts = ["```json", "```", "``", "`", "{"]
+    ends = ["```", "``", "`", "}"]
+    end_index = -1
+    for s in starts:
+        start_index = json_string.find(s)
+        if start_index != -1:
+            if json_string[start_index] != "{":
+                start_index += len(s)
+            break
+    if start_index != -1:
+        for e in ends:
+            end_index = json_string.rfind(e, start_index)
+            if end_index != -1:
+                if json_string[end_index] == "}":
+                    end_index += 1
+                break
+    if start_index != -1 and end_index != -1 and start_index < end_index:
+        extracted_content = json_string[start_index:end_index].strip()
+        print("content:", extracted_content, start_index, end_index)
+        parsed = json.loads(extracted_content)
     elif json_string.startswith("{"):
         # Parse the JSON string into a Python dictionary
         parsed = json.loads(json_string)
     else:
         raise Exception("Could not find JSON block in the output.")
 
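The rewritten parser tolerates several fence styles instead of only ```json blocks. A few illustrative calls (behavior traced from the code above):

    parse_json_markdown('```json\n{"a": 1}\n```')   # -> {"a": 1}
    parse_json_markdown('`{"a": 1}`')               # single backticks now work
    parse_json_markdown('{"a": 1}')                 # bare braces
    parse_json_markdown("no json here")             # raises: Could not find JSON block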
@@ -0,0 +1,48 @@
+"""update-retrieval-resource
+
+Revision ID: 6af6a521a53e
+Revises: ec3df697ebbb
+Create Date: 2024-09-24 09:22:43.570120
+
+"""
+from alembic import op
+import models as models
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = '6af6a521a53e'
+down_revision = 'd57ba9ebb251'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('dataset_retriever_resources', schema=None) as batch_op:
+        batch_op.alter_column('document_id',
+               existing_type=sa.UUID(),
+               nullable=True)
+        batch_op.alter_column('data_source_type',
+               existing_type=sa.TEXT(),
+               nullable=True)
+        batch_op.alter_column('segment_id',
+               existing_type=sa.UUID(),
+               nullable=True)
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('dataset_retriever_resources', schema=None) as batch_op:
+        batch_op.alter_column('segment_id',
+               existing_type=sa.UUID(),
+               nullable=False)
+        batch_op.alter_column('data_source_type',
+               existing_type=sa.TEXT(),
+               nullable=False)
+        batch_op.alter_column('document_id',
+               existing_type=sa.UUID(),
+               nullable=False)
+
+    # ### end Alembic commands ###
@@ -0,0 +1,73 @@
+"""external_knowledge_api
+
+Revision ID: 33f5fac87f29
+Revises: 6af6a521a53e
+Create Date: 2024-09-25 04:34:57.249436
+
+"""
+from alembic import op
+import models as models
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = '33f5fac87f29'
+down_revision = '6af6a521a53e'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('external_knowledge_apis',
+    sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
+    sa.Column('name', sa.String(length=255), nullable=False),
+    sa.Column('description', sa.String(length=255), nullable=False),
+    sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
+    sa.Column('settings', sa.Text(), nullable=True),
+    sa.Column('created_by', models.types.StringUUID(), nullable=False),
+    sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
+    sa.Column('updated_by', models.types.StringUUID(), nullable=True),
+    sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
+    sa.PrimaryKeyConstraint('id', name='external_knowledge_apis_pkey')
+    )
+    with op.batch_alter_table('external_knowledge_apis', schema=None) as batch_op:
+        batch_op.create_index('external_knowledge_apis_name_idx', ['name'], unique=False)
+        batch_op.create_index('external_knowledge_apis_tenant_idx', ['tenant_id'], unique=False)
+
+    op.create_table('external_knowledge_bindings',
+    sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
+    sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
+    sa.Column('external_knowledge_api_id', models.types.StringUUID(), nullable=False),
+    sa.Column('dataset_id', models.types.StringUUID(), nullable=False),
+    sa.Column('external_knowledge_id', sa.Text(), nullable=False),
+    sa.Column('created_by', models.types.StringUUID(), nullable=False),
+    sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
+    sa.Column('updated_by', models.types.StringUUID(), nullable=True),
+    sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
+    sa.PrimaryKeyConstraint('id', name='external_knowledge_bindings_pkey')
+    )
+    with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
+        batch_op.create_index('external_knowledge_bindings_dataset_idx', ['dataset_id'], unique=False)
+        batch_op.create_index('external_knowledge_bindings_external_knowledge_api_idx', ['external_knowledge_api_id'], unique=False)
+        batch_op.create_index('external_knowledge_bindings_external_knowledge_idx', ['external_knowledge_id'], unique=False)
+        batch_op.create_index('external_knowledge_bindings_tenant_idx', ['tenant_id'], unique=False)
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
+        batch_op.drop_index('external_knowledge_bindings_tenant_idx')
+        batch_op.drop_index('external_knowledge_bindings_external_knowledge_idx')
+        batch_op.drop_index('external_knowledge_bindings_external_knowledge_api_idx')
+        batch_op.drop_index('external_knowledge_bindings_dataset_idx')
+
+    op.drop_table('external_knowledge_bindings')
+    with op.batch_alter_table('external_knowledge_apis', schema=None) as batch_op:
+        batch_op.drop_index('external_knowledge_apis_tenant_idx')
+        batch_op.drop_index('external_knowledge_apis_name_idx')
+
+    op.drop_table('external_knowledge_apis')
+    # ### end Alembic commands ###
@@ -0,0 +1,48 @@
+"""fix wrong service-api history
+
+Revision ID: d8e744d88ed6
+Revises: 33f5fac87f29
+Create Date: 2024-10-09 13:29:23.548498
+
+"""
+from alembic import op
+from constants import UUID_NIL
+import models as models
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'd8e744d88ed6'
+down_revision = '33f5fac87f29'
+branch_labels = None
+depends_on = None
+
+# (UTC) release date of v0.9.0
+v0_9_0_release_date= '2024-09-29 12:00:00'
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    sql = f"""UPDATE
+    public.messages
+SET
+    parent_message_id = '{UUID_NIL}'
+WHERE
+    invoke_from = 'service-api'
+    AND parent_message_id IS NULL
+    AND created_at >= '{v0_9_0_release_date}';"""
+    op.execute(sql)
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    sql = f"""UPDATE
+    public.messages
+SET
+    parent_message_id = NULL
+WHERE
+    invoke_from = 'service-api'
+    AND parent_message_id = '{UUID_NIL}'
+    AND created_at >= '{v0_9_0_release_date}';"""
+    op.execute(sql)
+    # ### end Alembic commands ###
@@ -1,4 +1,4 @@
-"""add-dataset-retrival-model
+"""add-dataset-retrieval-model
 
 Revision ID: fca025d3b60f
 Revises: b3a09c049e8e
@@ -38,6 +38,7 @@ class Dataset(db.Model):
     )
 
     INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
+    PROVIDER_LIST = ["vendor", "external", None]
 
     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
     tenant_id = db.Column(StringUUID, nullable=False)
@@ -71,6 +72,14 @@ class Dataset(db.Model):
     def index_struct_dict(self):
         return json.loads(self.index_struct) if self.index_struct else None
 
+    @property
+    def external_retrieval_model(self):
+        default_retrieval_model = {
+            "top_k": 2,
+            "score_threshold": 0.0,
+        }
+        return self.retrieval_model or default_retrieval_model
+
     @property
     def created_by_account(self):
         return db.session.get(Account, self.created_by)
@@ -162,6 +171,29 @@ class Dataset(db.Model):
 
         return tags or []
 
+    @property
+    def external_knowledge_info(self):
+        if self.provider != "external":
+            return None
+        external_knowledge_binding = (
+            db.session.query(ExternalKnowledgeBindings).filter(ExternalKnowledgeBindings.dataset_id == self.id).first()
+        )
+        if not external_knowledge_binding:
+            return None
+        external_knowledge_api = (
+            db.session.query(ExternalKnowledgeApis)
+            .filter(ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id)
+            .first()
+        )
+        if not external_knowledge_api:
+            return None
+        return {
+            "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
+            "external_knowledge_api_id": external_knowledge_api.id,
+            "external_knowledge_api_name": external_knowledge_api.name,
+            "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
+        }
+
     @staticmethod
     def gen_collection_name_by_id(dataset_id: str) -> str:
         normalized_dataset_id = dataset_id.replace("-", "_")
@@ -687,3 +719,77 @@ class DatasetPermission(db.Model):
     tenant_id = db.Column(StringUUID, nullable=False)
     has_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
+
+
+class ExternalKnowledgeApis(db.Model):
+    __tablename__ = "external_knowledge_apis"
+    __table_args__ = (
+        db.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
+        db.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
+        db.Index("external_knowledge_apis_name_idx", "name"),
+    )
+
+    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
+    name = db.Column(db.String(255), nullable=False)
+    description = db.Column(db.String(255), nullable=False)
+    tenant_id = db.Column(StringUUID, nullable=False)
+    settings = db.Column(db.Text, nullable=True)
+    created_by = db.Column(StringUUID, nullable=False)
+    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
+    updated_by = db.Column(StringUUID, nullable=True)
+    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
+
+    def to_dict(self):
+        return {
+            "id": self.id,
+            "tenant_id": self.tenant_id,
+            "name": self.name,
+            "description": self.description,
+            "settings": self.settings_dict,
+            "dataset_bindings": self.dataset_bindings,
+            "created_by": self.created_by,
+            "created_at": self.created_at.isoformat(),
+        }
+
+    @property
+    def settings_dict(self):
+        try:
+            return json.loads(self.settings) if self.settings else None
+        except JSONDecodeError:
+            return None
+
+    @property
+    def dataset_bindings(self):
+        external_knowledge_bindings = (
+            db.session.query(ExternalKnowledgeBindings)
+            .filter(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
+            .all()
+        )
+        dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
+        datasets = db.session.query(Dataset).filter(Dataset.id.in_(dataset_ids)).all()
+        dataset_bindings = []
+        for dataset in datasets:
+            dataset_bindings.append({"id": dataset.id, "name": dataset.name})
+
+        return dataset_bindings
+
+
+class ExternalKnowledgeBindings(db.Model):
+    __tablename__ = "external_knowledge_bindings"
+    __table_args__ = (
+        db.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
+        db.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
+        db.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
+        db.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
+        db.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
+    )
+
+    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
+    tenant_id = db.Column(StringUUID, nullable=False)
+    external_knowledge_api_id = db.Column(StringUUID, nullable=False)
+    dataset_id = db.Column(StringUUID, nullable=False)
+    external_knowledge_id = db.Column(db.Text, nullable=False)
+    created_by = db.Column(StringUUID, nullable=False)
+    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
+    updated_by = db.Column(StringUUID, nullable=True)
+    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
@@ -1423,10 +1423,10 @@ class DatasetRetrieverResource(db.Model):
     position = db.Column(db.Integer, nullable=False)
     dataset_id = db.Column(StringUUID, nullable=False)
     dataset_name = db.Column(db.Text, nullable=False)
-    document_id = db.Column(StringUUID, nullable=False)
+    document_id = db.Column(StringUUID, nullable=True)
     document_name = db.Column(db.Text, nullable=False)
-    data_source_type = db.Column(db.Text, nullable=False)
-    segment_id = db.Column(StringUUID, nullable=False)
+    data_source_type = db.Column(db.Text, nullable=True)
+    segment_id = db.Column(StringUUID, nullable=True)
     score = db.Column(db.Float, nullable=True)
     content = db.Column(db.Text, nullable=False)
    hit_count = db.Column(db.Integer, nullable=True)
api/poetry.lock (generated; 1770 lines changed; file diff suppressed because it is too large)
@@ -148,6 +148,7 @@ chardet = "~5.1.0"
 cohere = "~5.2.4"
 cos-python-sdk-v5 = "1.9.30"
 esdk-obs-python = "3.24.6.1"
+bce-python-sdk = "~0.9.23"
 dashscope = { version = "~1.17.0", extras = ["tokenizer"] }
 flask = "~3.0.1"
 flask-compress = "~1.14"
@@ -221,6 +222,7 @@ volcengine-python-sdk = {extras = ["ark"], version = "^1.0.98"}
 oci = "^2.133.0"
 tos = "^2.7.1"
+nomic = "^3.1.2"
 validators = "0.21.0"
 [tool.poetry.group.indriect.dependencies]
 kaleido = "0.2.1"
 rank-bm25 = "~0.2.2"
@@ -235,7 +237,7 @@ arxiv = "2.1.0"
 cloudscraper = "1.2.71"
 matplotlib = "~3.8.2"
 newspaper3k = "0.2.8"
-duckduckgo-search = "^6.2.6"
+duckduckgo-search = "~6.3.0"
 jsonpath-ng = "1.6.1"
 numexpr = "~2.9.0"
 opensearch-py = "2.4.0"
api/schedule/clean_unused_messages_task.py (new file, 92 lines)
@@ -0,0 +1,92 @@
+import datetime
+import time
+
+import click
+from sqlalchemy import func
+from werkzeug.exceptions import NotFound
+
+import app
+from configs import dify_config
+from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
+from extensions.ext_database import db
+from models.dataset import Dataset, DatasetQuery, Document
+
+
+@app.celery.task(queue="dataset")
+def clean_unused_message_task():
+    click.echo(click.style("Start clean unused messages .", fg="green"))
+    clean_days = int(dify_config.CLEAN_DAY_SETTING)
+    start_at = time.perf_counter()
+    thirty_days_ago = datetime.datetime.now() - datetime.timedelta(days=clean_days)
+    page = 1
+    while True:
+        try:
+            # Subquery for counting new documents
+            document_subquery_new = (
+                db.session.query(Document.dataset_id, func.count(Document.id).label("document_count"))
+                .filter(
+                    Document.indexing_status == "completed",
+                    Document.enabled == True,
+                    Document.archived == False,
+                    Document.updated_at > thirty_days_ago,
+                )
+                .group_by(Document.dataset_id)
+                .subquery()
+            )
+
+            # Subquery for counting old documents
+            document_subquery_old = (
+                db.session.query(Document.dataset_id, func.count(Document.id).label("document_count"))
+                .filter(
+                    Document.indexing_status == "completed",
+                    Document.enabled == True,
+                    Document.archived == False,
+                    Document.updated_at < thirty_days_ago,
+                )
+                .group_by(Document.dataset_id)
+                .subquery()
+            )
+
+            # Main query with join and filter
+            datasets = (
+                db.session.query(Dataset)
+                .outerjoin(document_subquery_new, Dataset.id == document_subquery_new.c.dataset_id)
+                .outerjoin(document_subquery_old, Dataset.id == document_subquery_old.c.dataset_id)
+                .filter(
+                    Dataset.created_at < thirty_days_ago,
+                    func.coalesce(document_subquery_new.c.document_count, 0) == 0,
+                    func.coalesce(document_subquery_old.c.document_count, 0) > 0,
+                )
+                .order_by(Dataset.created_at.desc())
+                .paginate(page=page, per_page=50)
+            )
+
+        except NotFound:
+            break
+        if datasets.items is None or len(datasets.items) == 0:
+            break
+        page += 1
+        for dataset in datasets:
+            dataset_query = (
+                db.session.query(DatasetQuery)
+                .filter(DatasetQuery.created_at > thirty_days_ago, DatasetQuery.dataset_id == dataset.id)
+                .all()
+            )
+            if not dataset_query or len(dataset_query) == 0:
+                try:
+                    # remove index
+                    index_processor = IndexProcessorFactory(dataset.doc_form).init_index_processor()
+                    index_processor.clean(dataset, None)
+
+                    # update document
+                    update_params = {Document.enabled: False}
+
+                    Document.query.filter_by(dataset_id=dataset.id).update(update_params)
+                    db.session.commit()
+                    click.echo(click.style("Cleaned unused dataset {} from db success!".format(dataset.id), fg="green"))
+                except Exception as e:
+                    click.echo(
+                        click.style("clean dataset index error: {} {}".format(e.__class__.__name__, str(e)), fg="red")
+                    )
+    end_at = time.perf_counter()
+    click.echo(click.style("Cleaned unused dataset from db success latency: {}".format(end_at - start_at), fg="green"))
@@ -1,10 +1,13 @@
 from services.auth.firecrawl import FirecrawlAuth
+from services.auth.jina import JinaAuth
 
 
 class ApiKeyAuthFactory:
     def __init__(self, provider: str, credentials: dict):
         if provider == "firecrawl":
             self.auth = FirecrawlAuth(credentials)
+        elif provider == "jinareader":
+            self.auth = JinaAuth(credentials)
         else:
             raise ValueError("Invalid provider")
 
api/services/auth/jina.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+import json
+
+import requests
+
+from services.auth.api_key_auth_base import ApiKeyAuthBase
+
+
+class JinaAuth(ApiKeyAuthBase):
+    def __init__(self, credentials: dict):
+        super().__init__(credentials)
+        auth_type = credentials.get("auth_type")
+        if auth_type != "bearer":
+            raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer")
+        self.api_key = credentials.get("config").get("api_key", None)
+
+        if not self.api_key:
+            raise ValueError("No API key provided")
+
+    def validate_credentials(self):
+        headers = self._prepare_headers()
+        options = {
+            "url": "https://example.com",
+        }
+        response = self._post_request("https://r.jina.ai", options, headers)
+        if response.status_code == 200:
+            return True
+        else:
+            self._handle_error(response)
+
+    def _prepare_headers(self):
+        return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
+
+    def _post_request(self, url, data, headers):
+        return requests.post(url, headers=headers, json=data)
+
+    def _handle_error(self, response):
+        if response.status_code in {402, 409, 500}:
+            error_message = response.json().get("error", "Unknown error occurred")
+            raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
+        else:
+            if response.text:
+                error_message = json.loads(response.text).get("error", "Unknown error occurred")
+                raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
+            raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")
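How the factory above wires this in, sketched with placeholder credentials:

    auth = ApiKeyAuthFactory(
        provider="jinareader",
        credentials={"auth_type": "bearer", "config": {"api_key": "jina_xxx"}},  # placeholder key
    ).auth
    auth.validate_credentials()   # POSTs {"url": "https://example.com"} to https://r.jina.ai;
                                  # returns True on HTTP 200, raises otherwise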
@@ -32,6 +32,7 @@ from models.dataset import (
     DatasetQuery,
     Document,
     DocumentSegment,
+    ExternalKnowledgeBindings,
 )
 from models.model import UploadFile
 from models.source import DataSourceOauthBinding
@@ -39,6 +40,7 @@ from services.errors.account import NoPermissionError
 from services.errors.dataset import DatasetNameDuplicateError
 from services.errors.document import DocumentIndexingError
 from services.errors.file import FileNotExistsError
+from services.external_knowledge_service import ExternalDatasetService
 from services.feature_service import FeatureModel, FeatureService
 from services.tag_service import TagService
 from services.vector_service import VectorService
@@ -56,10 +58,8 @@ from tasks.sync_website_document_indexing_task import sync_website_document_inde
 
 class DatasetService:
     @staticmethod
-    def get_datasets(page, per_page, provider="vendor", tenant_id=None, user=None, search=None, tag_ids=None):
-        query = Dataset.query.filter(Dataset.provider == provider, Dataset.tenant_id == tenant_id).order_by(
-            Dataset.created_at.desc()
-        )
+    def get_datasets(page, per_page, tenant_id=None, user=None, search=None, tag_ids=None):
+        query = Dataset.query.filter(Dataset.tenant_id == tenant_id).order_by(Dataset.created_at.desc())
 
         if user:
             # get permitted dataset ids
@@ -137,7 +137,14 @@ class DatasetService:
 
     @staticmethod
     def create_empty_dataset(
-        tenant_id: str, name: str, indexing_technique: Optional[str], account: Account, permission: Optional[str] = None
+        tenant_id: str,
+        name: str,
+        indexing_technique: Optional[str],
+        account: Account,
+        permission: Optional[str] = None,
+        provider: str = "vendor",
+        external_knowledge_api_id: Optional[str] = None,
+        external_knowledge_id: Optional[str] = None,
     ):
         # check if dataset name already exists
         if Dataset.query.filter_by(name=name, tenant_id=tenant_id).first():
@@ -156,12 +163,28 @@ class DatasetService:
         dataset.embedding_model_provider = embedding_model.provider if embedding_model else None
         dataset.embedding_model = embedding_model.model if embedding_model else None
         dataset.permission = permission or DatasetPermissionEnum.ONLY_ME
+        dataset.provider = provider
         db.session.add(dataset)
+        db.session.flush()
+
+        if provider == "external" and external_knowledge_api_id:
+            external_knowledge_api = ExternalDatasetService.get_external_knowledge_api(external_knowledge_api_id)
+            if not external_knowledge_api:
+                raise ValueError("External API template not found.")
+            external_knowledge_binding = ExternalKnowledgeBindings(
+                tenant_id=tenant_id,
+                dataset_id=dataset.id,
+                external_knowledge_api_id=external_knowledge_api_id,
+                external_knowledge_id=external_knowledge_id,
+                created_by=account.id,
+            )
+            db.session.add(external_knowledge_binding)
+
         db.session.commit()
         return dataset
 
     @staticmethod
-    def get_dataset(dataset_id):
+    def get_dataset(dataset_id) -> Dataset:
         return Dataset.query.filter_by(id=dataset_id).first()
 
     @staticmethod
@@ -202,81 +225,106 @@ class DatasetService:

    @staticmethod
    def update_dataset(dataset_id, data, user):
-        data.pop("partial_member_list", None)
-        filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
        dataset = DatasetService.get_dataset(dataset_id)

        DatasetService.check_dataset_permission(dataset, user)
-        action = None
-        if dataset.indexing_technique != data["indexing_technique"]:
-            # if update indexing_technique
-            if data["indexing_technique"] == "economy":
-                action = "remove"
-                filtered_data["embedding_model"] = None
-                filtered_data["embedding_model_provider"] = None
-                filtered_data["collection_binding_id"] = None
-            elif data["indexing_technique"] == "high_quality":
-                action = "add"
-                # get embedding model setting
-                try:
-                    model_manager = ModelManager()
-                    embedding_model = model_manager.get_model_instance(
-                        tenant_id=current_user.current_tenant_id,
-                        provider=data["embedding_model_provider"],
-                        model_type=ModelType.TEXT_EMBEDDING,
-                        model=data["embedding_model"],
-                    )
-                    filtered_data["embedding_model"] = embedding_model.model
-                    filtered_data["embedding_model_provider"] = embedding_model.provider
-                    dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
-                        embedding_model.provider, embedding_model.model
-                    )
-                    filtered_data["collection_binding_id"] = dataset_collection_binding.id
-                except LLMBadRequestError:
-                    raise ValueError(
-                        "No Embedding Model available. Please configure a valid provider "
-                        "in the Settings -> Model Provider."
-                    )
-                except ProviderTokenNotInitError as ex:
-                    raise ValueError(ex.description)
-        else:
+        if dataset.provider == "external":
+            dataset.retrieval_model = data.get("external_retrieval_model", None)
+            dataset.name = data.get("name", dataset.name)
+            dataset.description = data.get("description", "")
+            external_knowledge_id = data.get("external_knowledge_id", None)
+            db.session.add(dataset)
+            if not external_knowledge_id:
+                raise ValueError("External knowledge id is required.")
+            external_knowledge_api_id = data.get("external_knowledge_api_id", None)
+            if not external_knowledge_api_id:
+                raise ValueError("External knowledge api id is required.")
+            external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(dataset_id=dataset_id).first()
            if (
-                data["embedding_model_provider"] != dataset.embedding_model_provider
-                or data["embedding_model"] != dataset.embedding_model
+                external_knowledge_binding.external_knowledge_id != external_knowledge_id
+                or external_knowledge_binding.external_knowledge_api_id != external_knowledge_api_id
            ):
-                action = "update"
-                try:
-                    model_manager = ModelManager()
-                    embedding_model = model_manager.get_model_instance(
-                        tenant_id=current_user.current_tenant_id,
-                        provider=data["embedding_model_provider"],
-                        model_type=ModelType.TEXT_EMBEDDING,
-                        model=data["embedding_model"],
-                    )
-                    filtered_data["embedding_model"] = embedding_model.model
-                    filtered_data["embedding_model_provider"] = embedding_model.provider
-                    dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
-                        embedding_model.provider, embedding_model.model
-                    )
-                    filtered_data["collection_binding_id"] = dataset_collection_binding.id
-                except LLMBadRequestError:
-                    raise ValueError(
-                        "No Embedding Model available. Please configure a valid provider "
-                        "in the Settings -> Model Provider."
-                    )
-                except ProviderTokenNotInitError as ex:
-                    raise ValueError(ex.description)
+                external_knowledge_binding.external_knowledge_id = external_knowledge_id
+                external_knowledge_binding.external_knowledge_api_id = external_knowledge_api_id
+                db.session.add(external_knowledge_binding)
+            db.session.commit()
+        else:
+            data.pop("partial_member_list", None)
+            data.pop("external_knowledge_api_id", None)
+            data.pop("external_knowledge_id", None)
+            data.pop("external_retrieval_model", None)
+            filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
+            action = None
+            if dataset.indexing_technique != data["indexing_technique"]:
+                # if update indexing_technique
+                if data["indexing_technique"] == "economy":
+                    action = "remove"
+                    filtered_data["embedding_model"] = None
+                    filtered_data["embedding_model_provider"] = None
+                    filtered_data["collection_binding_id"] = None
+                elif data["indexing_technique"] == "high_quality":
+                    action = "add"
+                    # get embedding model setting
+                    try:
+                        model_manager = ModelManager()
+                        embedding_model = model_manager.get_model_instance(
+                            tenant_id=current_user.current_tenant_id,
+                            provider=data["embedding_model_provider"],
+                            model_type=ModelType.TEXT_EMBEDDING,
+                            model=data["embedding_model"],
+                        )
+                        filtered_data["embedding_model"] = embedding_model.model
+                        filtered_data["embedding_model_provider"] = embedding_model.provider
+                        dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
+                            embedding_model.provider, embedding_model.model
+                        )
+                        filtered_data["collection_binding_id"] = dataset_collection_binding.id
+                    except LLMBadRequestError:
+                        raise ValueError(
+                            "No Embedding Model available. Please configure a valid provider "
+                            "in the Settings -> Model Provider."
+                        )
+                    except ProviderTokenNotInitError as ex:
+                        raise ValueError(ex.description)
+            else:
+                if (
+                    data["embedding_model_provider"] != dataset.embedding_model_provider
+                    or data["embedding_model"] != dataset.embedding_model
+                ):
+                    action = "update"
+                    try:
+                        model_manager = ModelManager()
+                        embedding_model = model_manager.get_model_instance(
+                            tenant_id=current_user.current_tenant_id,
+                            provider=data["embedding_model_provider"],
+                            model_type=ModelType.TEXT_EMBEDDING,
+                            model=data["embedding_model"],
+                        )
+                        filtered_data["embedding_model"] = embedding_model.model
+                        filtered_data["embedding_model_provider"] = embedding_model.provider
+                        dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
+                            embedding_model.provider, embedding_model.model
+                        )
+                        filtered_data["collection_binding_id"] = dataset_collection_binding.id
+                    except LLMBadRequestError:
+                        raise ValueError(
+                            "No Embedding Model available. Please configure a valid provider "
+                            "in the Settings -> Model Provider."
+                        )
+                    except ProviderTokenNotInitError as ex:
+                        raise ValueError(ex.description)

-        filtered_data["updated_by"] = user.id
-        filtered_data["updated_at"] = datetime.datetime.now()
+            filtered_data["updated_by"] = user.id
+            filtered_data["updated_at"] = datetime.datetime.now()

-        # update Retrieval model
-        filtered_data["retrieval_model"] = data["retrieval_model"]
+            # update Retrieval model
+            filtered_data["retrieval_model"] = data["retrieval_model"]

-        dataset.query.filter_by(id=dataset_id).update(filtered_data)
+            dataset.query.filter_by(id=dataset_id).update(filtered_data)

-        db.session.commit()
-        if action:
-            deal_dataset_vector_index_task.delay(dataset_id, action)
+            db.session.commit()
+            if action:
+                deal_dataset_vector_index_task.delay(dataset_id, action)
        return dataset

    @staticmethod
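Worth noting: the filtered_data comprehension used by both versions of update_dataset keeps "description" even when it is None, and drops every other None value. A quick illustration:

data = {"name": "kb", "description": None, "indexing_technique": None}
filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
assert filtered_data == {"name": "kb", "description": None}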
26
api/services/entities/external_knowledge_entities/external_knowledge_entities.py
Normal file
@@ -0,0 +1,26 @@
from typing import Literal, Optional, Union

from pydantic import BaseModel


class AuthorizationConfig(BaseModel):
    type: Literal[None, "basic", "bearer", "custom"]
    api_key: Union[None, str] = None
    header: Union[None, str] = None


class Authorization(BaseModel):
    type: Literal["no-auth", "api-key"]
    config: Optional[AuthorizationConfig] = None


class ProcessStatusSetting(BaseModel):
    request_method: str
    url: str


class ExternalKnowledgeApiSetting(BaseModel):
    url: str
    request_method: str
    headers: Optional[dict] = None
    params: Optional[dict] = None
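A quick usage sketch for these models, assuming the dify API package is importable (all values are placeholders); parse_obj is the pydantic v1-style constructor that get_external_knowledge_api_settings in the service below relies on:

from services.entities.external_knowledge_entities.external_knowledge_entities import (
    Authorization,
    AuthorizationConfig,
    ExternalKnowledgeApiSetting,
)

setting = ExternalKnowledgeApiSetting.parse_obj({
    "url": "https://kb.example.com/retrieval",  # placeholder endpoint
    "request_method": "post",
    "headers": {"Content-Type": "application/json"},
})
auth = Authorization(type="api-key", config=AuthorizationConfig(type="bearer", api_key="sk-placeholder"))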
274
api/services/external_knowledge_service.py
Normal file
@@ -0,0 +1,274 @@
import json
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Optional, Union

import httpx
import validators

# from tasks.external_document_indexing_task import external_document_indexing_task
from core.helper import ssrf_proxy
from extensions.ext_database import db
from models.dataset import (
    Dataset,
    ExternalKnowledgeApis,
    ExternalKnowledgeBindings,
)
from services.entities.external_knowledge_entities.external_knowledge_entities import (
    Authorization,
    ExternalKnowledgeApiSetting,
)
from services.errors.dataset import DatasetNameDuplicateError


class ExternalDatasetService:
    @staticmethod
    def get_external_knowledge_apis(page, per_page, tenant_id, search=None) -> tuple[list[ExternalKnowledgeApis], int]:
        query = ExternalKnowledgeApis.query.filter(ExternalKnowledgeApis.tenant_id == tenant_id).order_by(
            ExternalKnowledgeApis.created_at.desc()
        )
        if search:
            query = query.filter(ExternalKnowledgeApis.name.ilike(f"%{search}%"))

        external_knowledge_apis = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)

        return external_knowledge_apis.items, external_knowledge_apis.total

    @classmethod
    def validate_api_list(cls, api_settings: dict):
        if not api_settings:
            raise ValueError("api list is empty")
        if "endpoint" not in api_settings or not api_settings["endpoint"]:
            raise ValueError("endpoint is required")
        if "api_key" not in api_settings or not api_settings["api_key"]:
            raise ValueError("api_key is required")

    @staticmethod
    def create_external_knowledge_api(tenant_id: str, user_id: str, args: dict) -> ExternalKnowledgeApis:
        ExternalDatasetService.check_endpoint_and_api_key(args.get("settings"))
        external_knowledge_api = ExternalKnowledgeApis(
            tenant_id=tenant_id,
            created_by=user_id,
            updated_by=user_id,
            name=args.get("name"),
            description=args.get("description", ""),
            settings=json.dumps(args.get("settings"), ensure_ascii=False),
        )

        db.session.add(external_knowledge_api)
        db.session.commit()
        return external_knowledge_api

    @staticmethod
    def check_endpoint_and_api_key(settings: dict):
        if "endpoint" not in settings or not settings["endpoint"]:
            raise ValueError("endpoint is required")
        if "api_key" not in settings or not settings["api_key"]:
            raise ValueError("api_key is required")

        endpoint = f"{settings['endpoint']}/retrieval"
        api_key = settings["api_key"]
        if not validators.url(endpoint):
            raise ValueError(f"invalid endpoint: {endpoint}")
        try:
            response = httpx.post(endpoint, headers={"Authorization": f"Bearer {api_key}"})
        except Exception as e:
            raise ValueError(f"failed to connect to the endpoint: {endpoint}")
        if response.status_code == 502:
            raise ValueError(f"Bad Gateway: failed to connect to the endpoint: {endpoint}")
        if response.status_code == 404:
            raise ValueError(f"Not Found: failed to connect to the endpoint: {endpoint}")
        if response.status_code == 403:
            raise ValueError(f"Forbidden: Authorization failed with api_key: {api_key}")

    @staticmethod
    def get_external_knowledge_api(external_knowledge_api_id: str) -> ExternalKnowledgeApis:
        return ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id).first()

    @staticmethod
    def update_external_knowledge_api(tenant_id, user_id, external_knowledge_api_id, args) -> ExternalKnowledgeApis:
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_api_id, tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")

        external_knowledge_api.name = args.get("name")
        external_knowledge_api.description = args.get("description", "")
        external_knowledge_api.settings = json.dumps(args.get("settings"), ensure_ascii=False)
        external_knowledge_api.updated_by = user_id
        external_knowledge_api.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()

        return external_knowledge_api

    @staticmethod
    def delete_external_knowledge_api(tenant_id: str, external_knowledge_api_id: str):
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_api_id, tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")

        db.session.delete(external_knowledge_api)
        db.session.commit()

    @staticmethod
    def external_knowledge_api_use_check(external_knowledge_api_id: str) -> tuple[bool, int]:
        count = ExternalKnowledgeBindings.query.filter_by(external_knowledge_api_id=external_knowledge_api_id).count()
        if count > 0:
            return True, count
        return False, 0

    @staticmethod
    def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
        external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
            dataset_id=dataset_id, tenant_id=tenant_id
        ).first()
        if not external_knowledge_binding:
            raise ValueError("external knowledge binding not found")
        return external_knowledge_binding

    @staticmethod
    def document_create_args_validate(tenant_id: str, external_knowledge_api_id: str, process_parameter: dict):
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_api_id, tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")
        settings = json.loads(external_knowledge_api.settings)
        for setting in settings:
            custom_parameters = setting.get("document_process_setting")
            if custom_parameters:
                for parameter in custom_parameters:
                    if parameter.get("required", False) and not process_parameter.get(parameter.get("name")):
                        raise ValueError(f'{parameter.get("name")} is required')

    @staticmethod
    def process_external_api(
        settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]]
    ) -> httpx.Response:
        """
        do http request depending on api bundle
        """

        kwargs = {
            "url": settings.url,
            "headers": settings.headers,
            "follow_redirects": True,
        }

        response = getattr(ssrf_proxy, settings.request_method)(data=json.dumps(settings.params), files=files, **kwargs)

        return response

    @staticmethod
    def assembling_headers(authorization: Authorization, headers: Optional[dict] = None) -> dict[str, Any]:
        authorization = deepcopy(authorization)
        if headers:
            headers = deepcopy(headers)
        else:
            headers = {}
        if authorization.type == "api-key":
            if authorization.config is None:
                raise ValueError("authorization config is required")

            if authorization.config.api_key is None:
                raise ValueError("api_key is required")

            if not authorization.config.header:
                authorization.config.header = "Authorization"

            if authorization.config.type == "bearer":
                headers[authorization.config.header] = f"Bearer {authorization.config.api_key}"
            elif authorization.config.type == "basic":
                headers[authorization.config.header] = f"Basic {authorization.config.api_key}"
            elif authorization.config.type == "custom":
                headers[authorization.config.header] = authorization.config.api_key

        return headers
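Concretely, assembling_headers yields the following for each config type, as a minimal sketch (the key and the custom header name are placeholders):

api_key = "sk-placeholder"
base = {"Content-Type": "application/json"}

bearer_headers = {**base, "Authorization": f"Bearer {api_key}"}  # config.type == "bearer"
basic_headers = {**base, "Authorization": f"Basic {api_key}"}    # config.type == "basic"
custom_headers = {**base, "X-Api-Key": api_key}                  # config.type == "custom", header="X-Api-Key"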

    @staticmethod
    def get_external_knowledge_api_settings(settings: dict) -> ExternalKnowledgeApiSetting:
        return ExternalKnowledgeApiSetting.parse_obj(settings)

    @staticmethod
    def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
        # check if dataset name already exists
        if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first():
            raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.")
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=args.get("external_knowledge_api_id"), tenant_id=tenant_id
        ).first()

        if external_knowledge_api is None:
            raise ValueError("api template not found")

        dataset = Dataset(
            tenant_id=tenant_id,
            name=args.get("name"),
            description=args.get("description", ""),
            provider="external",
            retrieval_model=args.get("external_retrieval_model"),
            created_by=user_id,
        )

        db.session.add(dataset)
        db.session.flush()

        external_knowledge_binding = ExternalKnowledgeBindings(
            tenant_id=tenant_id,
            dataset_id=dataset.id,
            external_knowledge_api_id=args.get("external_knowledge_api_id"),
            external_knowledge_id=args.get("external_knowledge_id"),
            created_by=user_id,
        )
        db.session.add(external_knowledge_binding)

        db.session.commit()

        return dataset

    @staticmethod
    def fetch_external_knowledge_retrieval(
        tenant_id: str, dataset_id: str, query: str, external_retrieval_parameters: dict
    ) -> list:
        external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
            dataset_id=dataset_id, tenant_id=tenant_id
        ).first()
        if not external_knowledge_binding:
            raise ValueError("external knowledge binding not found")

        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_binding.external_knowledge_api_id
        ).first()
        if not external_knowledge_api:
            raise ValueError("external api template not found")

        settings = json.loads(external_knowledge_api.settings)
        headers = {"Content-Type": "application/json"}
        if settings.get("api_key"):
            headers["Authorization"] = f"Bearer {settings.get('api_key')}"
        score_threshold_enabled = external_retrieval_parameters.get("score_threshold_enabled") or False
        score_threshold = external_retrieval_parameters.get("score_threshold", 0.0) if score_threshold_enabled else 0.0
        request_params = {
            "retrieval_setting": {
                "top_k": external_retrieval_parameters.get("top_k"),
                "score_threshold": score_threshold,
            },
            "query": query,
            "knowledge_id": external_knowledge_binding.external_knowledge_id,
        }

        external_knowledge_api_setting = {
            "url": f"{settings.get('endpoint')}/retrieval",
            "request_method": "post",
            "headers": headers,
            "params": request_params,
        }
        response = ExternalDatasetService.process_external_api(
            ExternalKnowledgeApiSetting(**external_knowledge_api_setting), None
        )
        if response.status_code == 200:
            return response.json().get("records", [])
        return []
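The retrieval call assembled above is an ordinary JSON POST. A plain-httpx sketch of the same contract (endpoint, key, and knowledge id are placeholders); a conforming endpoint answers 200 with a JSON body containing a records list:

import httpx

payload = {
    "retrieval_setting": {"top_k": 5, "score_threshold": 0.5},
    "query": "What is Dify?",
    "knowledge_id": "your-knowledge-id",
}
response = httpx.post(
    "https://kb.example.com/retrieval",
    headers={"Content-Type": "application/json", "Authorization": "Bearer sk-placeholder"},
    json=payload,
)
records = response.json().get("records", []) if response.status_code == 200 else []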
@@ -19,7 +19,15 @@ default_retrieval_model = {

class HitTestingService:
    @classmethod
-    def retrieve(cls, dataset: Dataset, query: str, account: Account, retrieval_model: dict, limit: int = 10) -> dict:
+    def retrieve(
+        cls,
+        dataset: Dataset,
+        query: str,
+        account: Account,
+        retrieval_model: dict,
+        external_retrieval_model: dict,
+        limit: int = 10,
+    ) -> dict:
        if dataset.available_document_count == 0 or dataset.available_segment_count == 0:
            return {
                "query": {
@@ -62,10 +70,44 @@ class HitTestingService:

        return cls.compact_retrieve_response(dataset, query, all_documents)

+    @classmethod
+    def external_retrieve(
+        cls,
+        dataset: Dataset,
+        query: str,
+        account: Account,
+        external_retrieval_model: dict,
+    ) -> dict:
+        if dataset.provider != "external":
+            return {
+                "query": {"content": query},
+                "records": [],
+            }
+
+        start = time.perf_counter()
+
+        all_documents = RetrievalService.external_retrieve(
+            dataset_id=dataset.id,
+            query=cls.escape_query_for_search(query),
+            external_retrieval_model=external_retrieval_model,
+        )
+
+        end = time.perf_counter()
+        logging.debug(f"External knowledge hit testing retrieve in {end - start:0.4f} seconds")
+
+        dataset_query = DatasetQuery(
+            dataset_id=dataset.id, content=query, source="hit_testing", created_by_role="account", created_by=account.id
+        )
+
+        db.session.add(dataset_query)
+        db.session.commit()
+
+        return cls.compact_external_retrieve_response(dataset, query, all_documents)
+
    @classmethod
    def compact_retrieve_response(cls, dataset: Dataset, query: str, documents: list[Document]):
-        i = 0
        records = []

        for document in documents:
            index_node_id = document.metadata["doc_id"]
@@ -81,7 +123,6 @@ class HitTestingService:
            )

            if not segment:
-                i += 1
                continue

            record = {
@@ -91,8 +132,6 @@ class HitTestingService:

            records.append(record)

-            i += 1
-
        return {
            "query": {
                "content": query,
@@ -100,6 +139,25 @@ class HitTestingService:
            "records": records,
        }

+    @classmethod
+    def compact_external_retrieve_response(cls, dataset: Dataset, query: str, documents: list):
+        records = []
+        if dataset.provider == "external":
+            for document in documents:
+                record = {
+                    "content": document.get("content", None),
+                    "title": document.get("title", None),
+                    "score": document.get("score", None),
+                    "metadata": document.get("metadata", None),
+                }
+                records.append(record)
+            return {
+                "query": {
+                    "content": query,
+                },
+                "records": records,
+            }
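The response shape this helper produces, sketched with sample data (the values are placeholders):

documents = [{"content": "Dify is an open-source LLM app platform.", "title": "intro.md", "score": 0.82, "metadata": {"path": "docs/intro.md"}}]
records = [
    {"content": d.get("content"), "title": d.get("title"), "score": d.get("score"), "metadata": d.get("metadata")}
    for d in documents
]
result = {"query": {"content": "What is Dify?"}, "records": records}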

    @classmethod
    def hit_testing_args_check(cls, args):
        query = args["query"]

@@ -1,6 +1,7 @@
import datetime
import json

+import requests
from flask_login import current_user

from core.helper import encrypter
@@ -65,6 +66,35 @@ class WebsiteService:
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
+        elif provider == "jinareader":
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            crawl_sub_pages = options.get("crawl_sub_pages", False)
+            if not crawl_sub_pages:
+                response = requests.get(
+                    f"https://r.jina.ai/{url}",
+                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
+                )
+                if response.json().get("code") != 200:
+                    raise ValueError("Failed to crawl")
+                return {"status": "active", "data": response.json().get("data")}
+            else:
+                response = requests.post(
+                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
+                    json={
+                        "url": url,
+                        "maxPages": options.get("limit", 1),
+                        "useSitemap": options.get("use_sitemap", True),
+                    },
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {api_key}",
+                    },
+                )
+                if response.json().get("code") != 200:
+                    raise ValueError("Failed to crawl")
+                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
        else:
            raise ValueError("Invalid provider")

@@ -93,6 +123,42 @@ class WebsiteService:
            time_consuming = abs(end_time - float(start_time))
            crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
            redis_client.delete(website_crawl_time_cache_key)
+        elif provider == "jinareader":
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            response = requests.post(
+                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                json={"taskId": job_id},
+            )
+            data = response.json().get("data", {})
+            crawl_status_data = {
+                "status": data.get("status", "active"),
+                "job_id": job_id,
+                "total": len(data.get("urls", [])),
+                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
+                "data": [],
+                "time_consuming": data.get("duration", 0) / 1000,
+            }
+
+            if crawl_status_data["status"] == "completed":
+                response = requests.post(
+                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
+                )
+                data = response.json().get("data", {})
+                formatted_data = [
+                    {
+                        "title": item.get("data", {}).get("title"),
+                        "source_url": item.get("data", {}).get("url"),
+                        "description": item.get("data", {}).get("description"),
+                        "markdown": item.get("data", {}).get("content"),
+                    }
+                    for item in data.get("processed", {}).values()
+                ]
+                crawl_status_data["data"] = formatted_data
        else:
            raise ValueError("Invalid provider")
        return crawl_status_data
@@ -100,6 +166,8 @@ class WebsiteService:
    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict | None:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
+        # decrypt api_key
+        api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
        if provider == "firecrawl":
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
@@ -107,8 +175,6 @@ class WebsiteService:
                if data:
                    data = json.loads(data.decode("utf-8"))
            else:
-                # decrypt api_key
-                api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
                firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
                result = firecrawl_app.check_crawl_status(job_id)
                if result.get("status") != "completed":
@@ -119,6 +185,40 @@ class WebsiteService:
                if item.get("source_url") == url:
                    return item
            return None
+        elif provider == "jinareader":
+            file_key = "website_files/" + job_id + ".txt"
+            if storage.exists(file_key):
+                data = storage.load_once(file_key)
+                if data:
+                    data = json.loads(data.decode("utf-8"))
+            elif not job_id:
+                response = requests.get(
+                    f"https://r.jina.ai/{url}",
+                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
+                )
+                if response.json().get("code") != 200:
+                    raise ValueError("Failed to crawl")
+                return response.json().get("data")
+            else:
+                api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
+                response = requests.post(
+                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                    json={"taskId": job_id},
+                )
+                data = response.json().get("data", {})
+                if data.get("status") != "completed":
+                    raise ValueError("Crawl job is not completed")
+
+                response = requests.post(
+                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
+                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
+                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
+                )
+                data = response.json().get("data", {})
+                for item in data.get("processed", {}).values():
+                    if item.get("data", {}).get("url") == url:
+                        return item.get("data", {})
        else:
            raise ValueError("Invalid provider")

93
api/tasks/external_document_indexing_task.py
Normal file
@@ -0,0 +1,93 @@
import json
import logging
import time

import click
from celery import shared_task

from core.indexing_runner import DocumentIsPausedException
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import Dataset, ExternalKnowledgeApis
from models.model import UploadFile
from services.external_knowledge_service import ExternalDatasetService


@shared_task(queue="dataset")
def external_document_indexing_task(
    dataset_id: str, external_knowledge_api_id: str, data_source: dict, process_parameter: dict
):
    """
    Async process document
    :param dataset_id:
    :param external_knowledge_api_id:
    :param data_source:
    :param process_parameter:
    Usage: external_document_indexing_task.delay(dataset_id, external_knowledge_api_id, data_source, process_parameter)
    """
    start_at = time.perf_counter()

    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    if not dataset:
        logging.info(
            click.style("Processed external dataset: {} failed, dataset does not exist.".format(dataset_id), fg="red")
        )
        return

    # get external api template
    external_knowledge_api = (
        db.session.query(ExternalKnowledgeApis)
        .filter(
            ExternalKnowledgeApis.id == external_knowledge_api_id, ExternalKnowledgeApis.tenant_id == dataset.tenant_id
        )
        .first()
    )

    if not external_knowledge_api:
        logging.info(
            click.style(
                "Processed external dataset: {} failed, api template: {} does not exist.".format(
                    dataset_id, external_knowledge_api_id
                ),
                fg="red",
            )
        )
        return
    files = {}
    if data_source["type"] == "upload_file":
        upload_file_list = data_source["info_list"]["file_info_list"]["file_ids"]
        for file_id in upload_file_list:
            file = (
                db.session.query(UploadFile)
                .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                .first()
            )
            if file:
                files[file.id] = (file.name, storage.load_once(file.key), file.mime_type)
    try:
        settings = ExternalDatasetService.get_external_knowledge_api_settings(
            json.loads(external_knowledge_api.settings)
        )
        # assemble headers
        headers = ExternalDatasetService.assembling_headers(settings.authorization, settings.headers)

        # do http request
        response = ExternalDatasetService.process_external_api(settings, headers, process_parameter, files)
        job_id = response.json().get("job_id")
        if job_id:
            # save job_id to dataset
            dataset.job_id = job_id
            db.session.commit()

        end_at = time.perf_counter()
        logging.info(
            click.style(
                "Processed external dataset: {} successful, latency: {}".format(dataset.id, end_at - start_at),
                fg="green",
            )
        )
    except DocumentIsPausedException as ex:
        logging.info(click.style(str(ex), fg="yellow"))

    except Exception:
        pass
38
api/tests/unit_tests/controllers/test_compare_versions.py
Normal file
@@ -0,0 +1,38 @@
import pytest

from controllers.console.version import _has_new_version


@pytest.mark.parametrize(
    ("latest_version", "current_version", "expected"),
    [
        ("1.0.1", "1.0.0", True),
        ("1.1.0", "1.0.0", True),
        ("2.0.0", "1.9.9", True),
        ("1.0.0", "1.0.0", False),
        ("1.0.0", "1.0.1", False),
        ("1.0.0", "2.0.0", False),
        ("1.0.1", "1.0.0-beta", True),
        ("1.0.0", "1.0.0-alpha", True),
        ("1.0.0-beta", "1.0.0-alpha", True),
        ("1.0.0", "1.0.0-rc1", True),
        ("1.0.0", "0.9.9", True),
        ("1.0.0", "1.0.0-dev", True),
    ],
)
def test_has_new_version(latest_version, current_version, expected):
    assert _has_new_version(latest_version=latest_version, current_version=current_version) == expected


def test_has_new_version_invalid_input():
    with pytest.raises(ValueError):
        _has_new_version(latest_version="1.0", current_version="1.0.0")

    with pytest.raises(ValueError):
        _has_new_version(latest_version="1.0.0", current_version="1.0")

    with pytest.raises(ValueError):
        _has_new_version(latest_version="invalid", current_version="1.0.0")

    with pytest.raises(ValueError):
        _has_new_version(latest_version="1.0.0", current_version="invalid")
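One implementation consistent with these tests, as a sketch rather than the actual code in controllers/console/version.py: delegate ordering to packaging, which already ranks pre-releases such as -beta and -rc1 below the final release, and reject anything that is not a three-part version:

from packaging import version

def has_new_version(latest_version: str, current_version: str) -> bool:
    def parse(value: str) -> version.Version:
        # Require x.y.z before any pre-release suffix; packaging itself
        # raises InvalidVersion (a ValueError subclass) on garbage input.
        if len(value.split("-")[0].split(".")) != 3:
            raise ValueError(f"Invalid version format: {value}")
        return version.parse(value)

    return parse(latest_version) > parse(current_version)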
@@ -7,5 +7,4 @@ pytest api/tests/integration_tests/vdb/chroma \
  api/tests/integration_tests/vdb/pgvector \
  api/tests/integration_tests/vdb/qdrant \
  api/tests/integration_tests/vdb/weaviate \
-  api/tests/integration_tests/vdb/elasticsearch \
-  api/tests/integration_tests/vdb/test_vector_store.py
+  api/tests/integration_tests/vdb/elasticsearch
@@ -2,7 +2,7 @@ version: '3'
services:
  # API service
  api:
-    image: langgenius/dify-api:0.8.3
+    image: langgenius/dify-api:0.9.1-fix1
    restart: always
    environment:
      # Startup mode, 'api' starts the API server.
@@ -227,7 +227,7 @@ services:
  # worker service
  # The Celery worker for processing the queue.
  worker:
-    image: langgenius/dify-api:0.8.3
+    image: langgenius/dify-api:0.9.1-fix1
    restart: always
    environment:
      CONSOLE_WEB_URL: ''
@@ -396,7 +396,7 @@ services:

  # Frontend web application.
  web:
-    image: langgenius/dify-web:0.8.3
+    image: langgenius/dify-web:0.9.1-fix1
    restart: always
    environment:
      # The base URL of console application api server, refers to the Console base URL of WEB service if console domain is

@@ -213,7 +213,7 @@ x-shared-env: &shared-api-worker-env
services:
  # API service
  api:
-    image: langgenius/dify-api:0.8.3
+    image: langgenius/dify-api:0.9.1-fix1
    restart: always
    environment:
      # Use the shared environment variables.
@@ -233,7 +233,7 @@ services:
  # worker service
  # The Celery worker for processing the queue.
  worker:
-    image: langgenius/dify-api:0.8.3
+    image: langgenius/dify-api:0.9.1-fix1
    restart: always
    environment:
      # Use the shared environment variables.
@@ -252,7 +252,7 @@ services:

  # Frontend web application.
  web:
-    image: langgenius/dify-web:0.8.3
+    image: langgenius/dify-web:0.9.1-fix1
    restart: always
    environment:
      CONSOLE_API_URL: ${CONSOLE_API_URL:-}

@@ -1,6 +1,6 @@
'use client'
import type { FC, SVGProps } from 'react'
-import React, { useEffect } from 'react'
+import React, { useEffect, useMemo } from 'react'
import { usePathname } from 'next/navigation'
import useSWR from 'swr'
import { useTranslation } from 'react-i18next'
@@ -203,12 +203,23 @@ const DatasetDetailLayout: FC<IAppDetailLayoutProps> = (props) => {
    datasetId,
  }, apiParams => fetchDatasetRelatedApps(apiParams.datasetId))

-  const navigation = [
-    { name: t('common.datasetMenus.documents'), href: `/datasets/${datasetId}/documents`, icon: DocumentTextIcon, selectedIcon: DocumentTextSolidIcon },
-    { name: t('common.datasetMenus.hitTesting'), href: `/datasets/${datasetId}/hitTesting`, icon: TargetIcon, selectedIcon: TargetSolidIcon },
-    // { name: 'api & webhook', href: `/datasets/${datasetId}/api`, icon: CommandLineIcon, selectedIcon: CommandLineSolidIcon },
-    { name: t('common.datasetMenus.settings'), href: `/datasets/${datasetId}/settings`, icon: Cog8ToothIcon, selectedIcon: Cog8ToothSolidIcon },
-  ]
+  const navigation = useMemo(() => {
+    const baseNavigation = [
+      { name: t('common.datasetMenus.hitTesting'), href: `/datasets/${datasetId}/hitTesting`, icon: TargetIcon, selectedIcon: TargetSolidIcon },
+      // { name: 'api & webhook', href: `/datasets/${datasetId}/api`, icon: CommandLineIcon, selectedIcon: CommandLineSolidIcon },
+      { name: t('common.datasetMenus.settings'), href: `/datasets/${datasetId}/settings`, icon: Cog8ToothIcon, selectedIcon: Cog8ToothSolidIcon },
+    ]
+
+    if (datasetRes?.provider !== 'external') {
+      baseNavigation.unshift({
+        name: t('common.datasetMenus.documents'),
+        href: `/datasets/${datasetId}/documents`,
+        icon: DocumentTextIcon,
+        selectedIcon: DocumentTextSolidIcon,
+      })
+    }
+    return baseNavigation
+  }, [datasetRes?.provider, datasetId, t])

  useEffect(() => {
    if (datasetRes)
@@ -233,6 +244,7 @@ const DatasetDetailLayout: FC<IAppDetailLayoutProps> = (props) => {
        icon={datasetRes?.icon || 'https://static.dify.ai/images/dataset-default-icon.png'}
        icon_background={datasetRes?.icon_background || '#F5F5F5'}
        desc={datasetRes?.description || '--'}
+        isExternal={datasetRes?.provider === 'external'}
        navigation={navigation}
        extraInfo={!isCurrentWorkspaceDatasetOperator ? mode => <ExtraInfo isMobile={mode === 'collapse'} relatedApps={relatedApps} /> : undefined}
        iconType={datasetRes?.data_source_type === DataSourceType.NOTION ? 'notion' : 'dataset'}

@@ -8,6 +8,7 @@ import { useDebounceFn } from 'ahooks'
import useSWR from 'swr'

// Components
+import ExternalAPIPanel from '../../components/datasets/external-api/external-api-panel'
import Datasets from './Datasets'
import DatasetFooter from './DatasetFooter'
import ApiServer from './ApiServer'
@@ -16,6 +17,8 @@ import TabSliderNew from '@/app/components/base/tab-slider-new'
import SearchInput from '@/app/components/base/search-input'
import TagManagementModal from '@/app/components/base/tag-management'
import TagFilter from '@/app/components/base/tag-management/filter'
+import Button from '@/app/components/base/button'
+import { ApiConnectionMod } from '@/app/components/base/icons/src/vender/solid/development'

// Services
import { fetchDatasetApiBaseUrl } from '@/service/datasets'
@@ -24,12 +27,14 @@ import { fetchDatasetApiBaseUrl } from '@/service/datasets'
import { useTabSearchParams } from '@/hooks/use-tab-searchparams'
import { useStore as useTagStore } from '@/app/components/base/tag-management/store'
import { useAppContext } from '@/context/app-context'
+import { useExternalApiPanel } from '@/context/external-api-panel-context'

const Container = () => {
  const { t } = useTranslation()
  const router = useRouter()
  const { currentWorkspace } = useAppContext()
  const showTagManagementModal = useTagStore(s => s.showTagManagementModal)
+  const { showExternalApiPanel, setShowExternalApiPanel } = useExternalApiPanel()

  const options = useMemo(() => {
    return [
@@ -66,7 +71,7 @@ const Container = () => {
  useEffect(() => {
    if (currentWorkspace.role === 'normal')
      return router.replace('/apps')
-  }, [currentWorkspace])
+  }, [currentWorkspace, router])

  return (
    <div ref={containerRef} className='grow relative flex flex-col bg-gray-100 overflow-y-auto'>
@@ -80,11 +85,18 @@ const Container = () => {
          <div className='flex items-center gap-2'>
            <TagFilter type='knowledge' value={tagFilterValue} onChange={handleTagsChange} />
            <SearchInput className='w-[200px]' value={keywords} onChange={handleKeywordsChange} />
+            <div className="w-[1px] h-4 bg-divider-regular" />
+            <Button
+              className='gap-0.5 shadows-shadow-xs'
+              onClick={() => setShowExternalApiPanel(true)}
+            >
+              <ApiConnectionMod className='w-4 h-4 text-components-button-secondary-text' />
+              <div className='flex px-0.5 justify-center items-center gap-1 text-components-button-secondary-text system-sm-medium'>{t('dataset.externalAPIPanelTitle')}</div>
+            </Button>
          </div>
        )}
        {activeTab === 'api' && data && <ApiServer apiBaseUrl={data.api_base_url || ''} />}
      </div>

      {activeTab === 'dataset' && (
        <>
          <Datasets containerRef={containerRef} tags={tagIDs} keywords={searchKeywords} />
@@ -94,10 +106,10 @@ const Container = () => {
          )}
        </>
      )}

      {activeTab === 'api' && data && <Doc apiBaseUrl={data.api_base_url || ''} />}
    </div>

+      {showExternalApiPanel && <ExternalAPIPanel onClose={() => setShowExternalApiPanel(false)} />}
    </div>
  )
}

@@ -18,6 +18,7 @@ import Divider from '@/app/components/base/divider'
import RenameDatasetModal from '@/app/components/datasets/rename-modal'
import type { Tag } from '@/app/components/base/tag-management/constant'
import TagSelector from '@/app/components/base/tag-management/selector'
+import CornerLabel from '@/app/components/base/corner-label'
import { useAppContext } from '@/context/app-context'

export type DatasetCardProps = {
@@ -32,6 +33,7 @@ const DatasetCard = ({
  const { t } = useTranslation()
  const { notify } = useContext(ToastContext)
  const { push } = useRouter()
+  const EXTERNAL_PROVIDER = 'external' as const

  const { isCurrentWorkspaceDatasetOperator } = useAppContext()
  const [tags, setTags] = useState<Tag[]>(dataset.tags)
@@ -39,6 +41,7 @@ const DatasetCard = ({
  const [showRenameModal, setShowRenameModal] = useState(false)
  const [showConfirmDelete, setShowConfirmDelete] = useState(false)
  const [confirmMessage, setConfirmMessage] = useState<string>('')
+  const isExternalProvider = (provider: string): boolean => provider === EXTERNAL_PROVIDER
  const detectIsUsedByApp = useCallback(async () => {
    try {
      const { is_using: isUsedByApp } = await checkIsUsedInApp(dataset.id)
@@ -108,13 +111,16 @@ const DatasetCard = ({
  return (
    <>
      <div
-        className='group col-span-1 bg-white border-2 border-solid border-transparent rounded-xl shadow-sm min-h-[160px] flex flex-col transition-all duration-200 ease-in-out cursor-pointer hover:shadow-lg'
+        className='group relative col-span-1 bg-white border-[0.5px] border-solid border-transparent rounded-xl shadow-sm min-h-[160px] flex flex-col transition-all duration-200 ease-in-out cursor-pointer hover:shadow-lg'
        data-disable-nprogress={true}
        onClick={(e) => {
          e.preventDefault()
-          push(`/datasets/${dataset.id}/documents`)
+          isExternalProvider(dataset.provider)
+            ? push(`/datasets/${dataset.id}/hitTesting`)
+            : push(`/datasets/${dataset.id}/documents`)
        }}
      >
+        {isExternalProvider(dataset.provider) && <CornerLabel label='External' className='absolute right-0' labelClassName='rounded-tr-xl' />}
        <div className='flex pt-[14px] px-[14px] pb-3 h-[66px] items-center gap-3 grow-0 shrink-0'>
          <div className={cn(
            'shrink-0 flex items-center justify-center p-2.5 bg-[#F5F8FF] rounded-md border-[0.5px] border-[#E0EAFF]',
@@ -136,13 +142,20 @@ const DatasetCard = ({
        <div className='flex items-center mt-[1px] text-xs leading-[18px] text-gray-500'>
          <div
            className={cn('truncate', (!dataset.embedding_available || !dataset.document_count) && 'opacity-50')}
-            title={`${dataset.document_count}${t('dataset.documentCount')} · ${Math.round(dataset.word_count / 1000)}${t('dataset.wordCount')} · ${dataset.app_count}${t('dataset.appCount')}`}
+            title={dataset.provider === 'external' ? `${dataset.app_count}${t('dataset.appCount')}` : `${dataset.document_count}${t('dataset.documentCount')} · ${Math.round(dataset.word_count / 1000)}${t('dataset.wordCount')} · ${dataset.app_count}${t('dataset.appCount')}`}
          >
-            <span>{dataset.document_count}{t('dataset.documentCount')}</span>
-            <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
-            <span>{Math.round(dataset.word_count / 1000)}{t('dataset.wordCount')}</span>
-            <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
-            <span>{dataset.app_count}{t('dataset.appCount')}</span>
+            {dataset.provider === 'external'
+              ? <>
+                <span>{dataset.app_count}{t('dataset.appCount')}</span>
+              </>
+              : <>
+                <span>{dataset.document_count}{t('dataset.documentCount')}</span>
+                <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
+                <span>{Math.round(dataset.word_count / 1000)}{t('dataset.wordCount')}</span>
+                <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
+                <span>{dataset.app_count}{t('dataset.appCount')}</span>
+              </>
+            }
          </div>
        </div>
      </div>

@@ -4,21 +4,32 @@ import { forwardRef } from 'react'
import { useTranslation } from 'react-i18next'
import {
  RiAddLine,
+  RiArrowRightLine,
} from '@remixicon/react'

const CreateAppCard = forwardRef<HTMLAnchorElement>((_, ref) => {
  const { t } = useTranslation()

  return (
-    <a ref={ref} className='group flex flex-col col-span-1 bg-gray-200 border-[0.5px] border-black/5 rounded-xl min-h-[160px] transition-all duration-200 ease-in-out cursor-pointer hover:bg-white hover:shadow-lg' href='/datasets/create'>
-      <div className='shrink-0 flex items-center p-4 pb-3'>
-        <div className='w-10 h-10 flex items-center justify-center border border-gray-200 bg-gray-100 rounded-lg'>
-          <RiAddLine className='w-4 h-4 text-gray-500'/>
-        </div>
-        <div className='ml-3 text-sm font-semibold leading-5 text-gray-800 group-hover:text-primary-600'>{t('dataset.createDataset')}</div>
-      </div>
-      <div className='mb-1 px-4 text-xs leading-normal text-gray-500 line-clamp-4'>{t('dataset.createDatasetIntro')}</div>
-    </a>
+    <div className='flex flex-col bg-background-default-dimm border-[0.5px] border-components-panel-border rounded-xl
+      min-h-[160px] transition-all duration-200 ease-in-out'
+    >
+      <a ref={ref} className='group flex flex-grow items-start p-4 cursor-pointer' href='/datasets/create'>
+        <div className='flex items-center gap-3'>
+          <div className='w-10 h-10 p-2 flex items-center justify-center border border-dashed border-divider-regular rounded-lg
+            bg-background-default-lighter group-hover:border-solid group-hover:border-effects-highlight group-hover:bg-background-default-dodge'
+          >
+            <RiAddLine className='w-4 h-4 text-text-tertiary group-hover:text-text-accent'/>
+          </div>
+          <div className='system-md-semibold text-text-secondary group-hover:text-text-accent'>{t('dataset.createDataset')}</div>
+        </div>
+      </a>
+      <div className='p-4 pt-0 text-text-tertiary system-xs-regular'>{t('dataset.createDatasetIntro')}</div>
+      <a className='group flex p-4 items-center gap-1 border-t-[0.5px] border-divider-subtle rounded-b-xl cursor-pointer' href='/datasets/connect'>
+        <div className='system-xs-medium text-text-tertiary group-hover:text-text-accent'>{t('dataset.connectDataset')}</div>
+        <RiArrowRightLine className='w-3.5 h-3.5 text-text-tertiary group-hover:text-text-accent' />
+      </a>
+    </div>
  )
})

8
web/app/(commonLayout)/datasets/connect/page.tsx
Normal file
@@ -0,0 +1,8 @@
import React from 'react'
import ExternalKnowledgeBaseConnector from '@/app/components/datasets/external-knowledge-base/connector'

const ExternalKnowledgeBaseCreation = () => {
  return <ExternalKnowledgeBaseConnector />
}

export default ExternalKnowledgeBaseCreation
14
web/app/(commonLayout)/datasets/layout.tsx
Normal file
@@ -0,0 +1,14 @@
'use client'

import { ExternalApiPanelProvider } from '@/context/external-api-panel-context'
import { ExternalKnowledgeApiProvider } from '@/context/external-knowledge-api-context'

export default function DatasetsLayout({ children }: { children: React.ReactNode }) {
  return (
    <ExternalKnowledgeApiProvider>
      <ExternalApiPanelProvider>
        {children}
      </ExternalApiPanelProvider>
    </ExternalKnowledgeApiProvider>
  )
}
@@ -1,9 +1,7 @@
import Container from './Container'

const AppList = async () => {
-  return (
-    <Container />
-  )
+  return <Container />
}

export const metadata = {
11
web/app/(commonLayout)/datasets/store.ts
Normal file
@@ -0,0 +1,11 @@
import { create } from 'zustand'

type DatasetStore = {
  showExternalApiPanel: boolean
  setShowExternalApiPanel: (show: boolean) => void
}

export const useDatasetStore = create<DatasetStore>(set => ({
  showExternalApiPanel: false,
  setShowExternalApiPanel: show => set({ showExternalApiPanel: show }),
}))
@@ -1,4 +1,5 @@
import React from 'react'
+import { useTranslation } from 'react-i18next'
import AppIcon from '../base/app-icon'
import Tooltip from '@/app/components/base/tooltip'

@@ -6,6 +7,7 @@ export type IAppBasicProps = {
  iconType?: 'app' | 'api' | 'dataset' | 'webapp' | 'notion'
  icon?: string
  icon_background?: string | null
+  isExternal?: boolean
  name: string
  type: string | React.ReactNode
  hoverTip?: string
@@ -52,7 +54,9 @@ const ICON_MAP = {
  notion: <AppIcon innerIcon={NotionSvg} className='!border-[0.5px] !border-indigo-100 !bg-white' />,
}

-export default function AppBasic({ icon, icon_background, name, type, hoverTip, textStyle, mode = 'expand', iconType = 'app' }: IAppBasicProps) {
+export default function AppBasic({ icon, icon_background, name, isExternal, type, hoverTip, textStyle, mode = 'expand', iconType = 'app' }: IAppBasicProps) {
+  const { t } = useTranslation()
+
  return (
    <div className="flex items-start p-1">
      {icon && icon_background && iconType === 'app' && (
@@ -83,6 +87,7 @@ export default function AppBasic({ icon, icon_background, name, type, hoverTip,
        }
      </div>
      <div className={`text-xs font-normal text-gray-500 group-hover:text-gray-700 break-all ${textStyle?.extra ?? ''}`}>{type}</div>
+      <div className='text-text-tertiary system-2xs-medium-uppercase'>{isExternal ? t('dataset.externalTag') : ''}</div>
    </div>}
  </div>
)
Some files were not shown because too many files have changed in this diff.