Feat: Resolved langgenius/dify#17150 by implementing recursive metadata subqueries (Backend)
Summary
-
We have implemented a complex query mechanism to resolve https://github.com/langgenius/dify/issues/17150 by adopting a recursive strategy to support multi-layered nested subqueries.
-
Additionally, to ensure compatibility with historical data, we introduced a new option
`metadata_filtering_mode = 'complex_conditions'`, and added a new field `metadata_filtering_complex_conditions` in `KnowledgeRetrievalNodeData`. -
This PR involves only backend code modifications and essential frontend adjustments, no UI involved.
Screenshots
| Key | Before | After |
|---|---|---|
| UI Changes | ... | |
| Metadata Filter SQL | ... | |
Checklist
[!IMPORTANT]
Please review the checklist below before submitting your pull request.
- [ ] This change requires a documentation update, included: Dify Document
- [x] I understand that this PR may be closed in case there was no previous discussion or issues. (This doesn't apply to typos!)
- [x] I've added a test for each change that was introduced, and I tried as much as possible to make a single atomic change.
- [x] I've updated the documentation accordingly.
- [x] I ran
`dev/reformat` (backend) and `cd web && npx lint-staged` (frontend) to appease the lint gods
Please take a look , thank you @JohnJyong @Yawen-1010
Hi, @hsiong. Thank you for making this metadata enhancement. Can you provide more explanation & screenshot of complexConditions? Especially about how users can modify the relation of conditions here. For users, implementing two layers of nested AND and OR logic requires UI design and front-end changes.
Hi, @hsiong. Thank you for making this metadata enhancement. Can you provide more explanation & screenshot of complexConditions? Especially about how users can modify the relation of conditions here. For users, implementing two layers of nested AND and OR logic requires UI design and front-end changes.
Inputs
Here is my test curl code used to add `metadata_filtering_complex_conditions` to the knowledge-retrieval node. Please replace `app_id`, `web_auth`, `knowledge_retrieval_node_id`, and `dataset_id` with the real data.
curl 'http://localhost:5001/console/api/apps/{app_id}/workflows/draft' \
-X POST \
-H 'User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0' \
-H 'Accept: */*' \
-H 'Accept-Language: en-US,en;q=0.5' \
-H 'Accept-Encoding: gzip, deflate, br' \
-H 'Referer: http://localhost:3000/' \
-H 'Authorization: Bearer {web_auth}' \
-H 'Content-Type: application/json' \
-H 'Origin: http://localhost:3000' \
-H 'Connection: keep-alive' \
-H 'Cookie: locale=en-US' \
-H 'Sec-Fetch-Dest: empty' \
-H 'Sec-Fetch-Mode: cors' \
-H 'Sec-Fetch-Site: same-site' \
--data-raw '{
"graph": {
"nodes": [
{
"id": "1744808299117",
"type": "custom",
"data": {
"type": "start",
"title": "Start",
"desc": "",
"variables": [
{
"variable": "query",
"label": "query",
"type": "text-input",
"max_length": 48,
"required": true,
"options": []
}
],
"selected": false
},
"position": {
"x": 80,
"y": 282
},
"targetPosition": "left",
"sourcePosition": "right",
"positionAbsolute": {
"x": 80,
"y": 282
},
"width": 244,
"height": 90,
"selected": false
},
{
"id": "1744808572132",
"type": "custom",
"data": {
"type": "end",
"title": "End",
"desc": "",
"outputs": [],
"selected": false
},
"position": {
"x": 688,
"y": 282
},
"targetPosition": "left",
"sourcePosition": "right",
"positionAbsolute": {
"x": 688,
"y": 282
},
"width": 244,
"height": 54
},
{
"id": "{knowledge_retrieval_node_id}",
"type": "custom",
"data": {
"type": "knowledge-retrieval",
"title": "Knowledge Retrieval",
"desc": "",
"query_variable_selector": [],
"dataset_ids": ["{dataset_id}"],
"retrieval_mode": "multiple",
"multiple_retrieval_config": {
"top_k": 4,
"reranking_model": {
"provider": "langgenius/jina/jina",
"model": "jina-colbert-v1-en"
},
"reranking_mode": "reranking_model",
"reranking_enable": false
},
"selected": true,
"metadata_filtering_mode": "complex_conditions",
"metadata_filtering_complex_conditions": {
"logical_operator": "and",
"conditions": null,
"sub_conditions": [
{
"logical_operator": "or",
"conditions": [
{
"name": "area",
"comparison_operator": "contains",
"value": "<1>"
},
{
"name": "area",
"comparison_operator": "contains",
"value": "<-1>"
}
],
"sub_conditions": [
{
"logical_operator": "and",
"conditions": [
{
"name": "job",
"comparison_operator": "contains",
"value": "<2>"
},
{
"name": "job",
"comparison_operator": "contains",
"value": "<-2>"
}
],
"sub_conditions": null
}
]
},
{
"logical_operator": "or",
"conditions": [
{
"name": "dept",
"comparison_operator": "contains",
"value": "<4>"
},
{
"name": "dept",
"comparison_operator": "contains",
"value": "<-4>"
}
]
}
]
}
},
"position": {
"x": 384,
"y": 282
},
"targetPosition": "left",
"sourcePosition": "right",
"positionAbsolute": {
"x": 384,
"y": 282
},
"width": 244,
"height": 92,
"selected": true
}
],
"edges": [
{
"id": "1744808299117-source-1744962408916-target",
"type": "custom",
"source": "1744808299117",
"sourceHandle": "source",
"target": "1744962408916",
"targetHandle": "target",
"data": {
"sourceType": "start",
"targetType": "knowledge-retrieval",
"isInIteration": false,
"isInLoop": false
},
"zIndex": 0
}
],
"viewport": {
"x": 105.26445020659747,
"y": -83.09128901243528,
"zoom": 0.7371346086455508
}
},
"features": {
"opening_statement": "",
"suggested_questions": [],
"suggested_questions_after_answer": {
"enabled": false
},
"text_to_speech": {
"enabled": false,
"voice": "",
"language": ""
},
"speech_to_text": {
"enabled": false
},
"retriever_resource": {
"enabled": true
},
"sensitive_word_avoidance": {
"enabled": false
},
"file_upload": {
"image": {
"enabled": false,
"number_limits": 3,
"transfer_methods": ["local_file", "remote_url"]
},
"enabled": false,
"allowed_file_types": ["image"],
"allowed_file_extensions": [".JPG", ".JPEG", ".PNG", ".GIF", ".WEBP", ".SVG"],
"allowed_file_upload_methods": ["local_file", "remote_url"],
"number_limits": 3,
"fileUploadConfig": {
"file_size_limit": 15,
"batch_count_limit": 5,
"image_file_size_limit": 10,
"video_file_size_limit": 100,
"audio_file_size_limit": 50,
"workflow_file_upload_limit": 10
}
}
},
"environment_variables": [],
"conversation_variables": [],
"hash": "{hash_code}"
}'
-
The core modification in API
`apps/{app_id}/workflows/draft` is the addition of a new property `complex_conditions` to the `metadata_filtering_mode` field of nodes with `type = knowledge-retrieval`. -
Additionally, a new field
`metadata_filtering_complex_conditions` has been introduced to this node to store information related to complex conditions. -
the structure of
`metadata_filtering_complex_conditions` is as follows:

```python
class MetadataFilteringComplexCondition(BaseModel):
    logical_operator: Optional[Literal["and", "or"]] = "and"
    conditions: Optional[list[Condition]] = Field(default=None, deprecated=True)
    sub_conditions: Optional[list["MetadataFilteringComplexCondition"]] = None
```

`MetadataFilteringComplexCondition` is a complex condition model used for metadata filtering, with support for nested logical operations:
- `logical_operator`: the logical operator to apply between conditions. Defaults to `"and"`; can be either `"and"` or `"or"`.
- `conditions`: a list of basic filtering conditions.
- `sub_conditions`: a list of nested complex conditions, allowing for recursive logical structures.
Metadata Filtering Test Result
After adding
`metadata_filtering_complex_conditions` to the `knowledge-retrieval` node, we can test metadata filtering with complex conditions on the `knowledge-retrieval` node. Just use "Test Run" on the Knowledge Retrieval node.

SQL Log
2025-04-18 07:31:29,302 INFO sqlalchemy.engine.Engine SELECT documents.id AS documents_id, documents.tenant_id AS documents_tenant_id, documents.dataset_id AS documents_dataset_id, documents.position AS documents_position, documents.data_source_type AS documents_data_source_type, documents.data_source_info AS documents_data_source_info, documents.dataset_process_rule_id AS documents_dataset_process_rule_id, documents.batch AS documents_batch, documents.name AS documents_name, documents.created_from AS documents_created_from, documents.created_by AS documents_created_by, documents.created_api_request_id AS documents_created_api_request_id, documents.created_at AS documents_created_at, documents.processing_started_at AS documents_processing_started_at, documents.file_id AS documents_file_id, documents.word_count AS documents_word_count, documents.parsing_completed_at AS documents_parsing_completed_at, documents.cleaning_completed_at AS documents_cleaning_completed_at, documents.splitting_completed_at AS documents_splitting_completed_at, documents.tokens AS documents_tokens, documents.indexing_latency AS documents_indexing_latency, documents.completed_at AS documents_completed_at, documents.is_paused AS documents_is_paused, documents.paused_by AS documents_paused_by, documents.paused_at AS documents_paused_at, documents.error AS documents_error, documents.stopped_at AS documents_stopped_at, documents.indexing_status AS documents_indexing_status, documents.enabled AS documents_enabled, documents.disabled_at AS documents_disabled_at, documents.disabled_by AS documents_disabled_by, documents.archived AS documents_archived, documents.archived_reason AS documents_archived_reason, documents.archived_by AS documents_archived_by, documents.archived_at AS documents_archived_at, documents.updated_at AS documents_updated_at, documents.doc_type AS documents_doc_type, documents.doc_metadata AS documents_doc_metadata, documents.doc_form AS documents_doc_form, documents.doc_language 
AS documents_doc_language FROM documents WHERE documents.dataset_id IN (%(dataset_id_1_1)s::UUID) AND documents.indexing_status = %(indexing_status_1)s AND documents.enabled = true AND documents.archived = false AND (documents.doc_metadata ->> %(job_0)s LIKE %(job_0_value)s AND documents.doc_metadata ->> %(job_1)s LIKE %(job_1_value)s OR documents.doc_metadata ->> %(area_0)s LIKE %(area_0_value)s OR documents.doc_metadata ->> %(area_1)s LIKE %(area_1_value)s) AND (documents.doc_metadata ->> %(dept_0)s LIKE %(dept_0_value)s OR documents.doc_metadata ->> %(dept_1)s LIKE %(dept_1_value)s) 2025-04-18 07:31:29,302 INFO sqlalchemy.engine.Engine [generated in 0.00135s] {'indexing_status_1': 'completed', 'job_0': 'job', 'job_0_value': '%<2>%', 'job_1': 'job', 'job_1_value': '%<-2>%', 'area_0': 'area', 'area_0_value': '%<1>%', 'area_1': 'area', 'area_1_value': '%<-1>%', 'dept_0': 'dept', 'dept_0_value': '%<4>%', 'dept_1': 'dept', 'dept_1_value': '%<-4>%', 'dataset_id_1_1': '{dataset_id}'}Combined SQL Log
SELECT
documents.id AS documents_id,
documents.tenant_id AS documents_tenant_id,
documents.dataset_id AS documents_dataset_id,
documents.position AS documents_position,
documents.data_source_type AS documents_data_source_type,
documents.data_source_info AS documents_data_source_info,
documents.dataset_process_rule_id AS documents_dataset_process_rule_id,
documents.batch AS documents_batch,
documents.name AS documents_name,
documents.created_from AS documents_created_from,
documents.created_by AS documents_created_by,
documents.created_api_request_id AS documents_created_api_request_id,
documents.created_at AS documents_created_at,
documents.processing_started_at AS documents_processing_started_at,
documents.file_id AS documents_file_id,
documents.word_count AS documents_word_count,
documents.parsing_completed_at AS documents_parsing_completed_at,
documents.cleaning_completed_at AS documents_cleaning_completed_at,
documents.splitting_completed_at AS documents_splitting_completed_at,
documents.tokens AS documents_tokens,
documents.indexing_latency AS documents_indexing_latency,
documents.completed_at AS documents_completed_at,
documents.is_paused AS documents_is_paused,
documents.paused_by AS documents_paused_by,
documents.paused_at AS documents_paused_at,
documents.error AS documents_error,
documents.stopped_at AS documents_stopped_at,
documents.indexing_status AS documents_indexing_status,
documents.enabled AS documents_enabled,
documents.disabled_at AS documents_disabled_at,
documents.disabled_by AS documents_disabled_by,
documents.archived AS documents_archived,
documents.archived_reason AS documents_archived_reason,
documents.archived_by AS documents_archived_by,
documents.archived_at AS documents_archived_at,
documents.updated_at AS documents_updated_at,
documents.doc_type AS documents_doc_type,
documents.doc_metadata AS documents_doc_metadata,
documents.doc_form AS documents_doc_form,
documents.doc_language AS documents_doc_language
FROM documents
WHERE
documents.dataset_id IN ('{dataset_id}')
AND documents.indexing_status = 'completed'
AND documents.enabled = true
AND documents.archived = false
AND (
documents.doc_metadata ->> 'job' LIKE '%<2>%'
AND documents.doc_metadata ->> 'job' LIKE '%<-2>%'
OR documents.doc_metadata ->> 'area' LIKE '%<1>%'
OR documents.doc_metadata ->> 'area' LIKE '%<-1>%'
)
AND (
documents.doc_metadata ->> 'dept' LIKE '%<4>%'
OR documents.doc_metadata ->> 'dept' LIKE '%<-4>%'
);
Additional Context
Sorry, I haven't finished the frontend code yet — I'm not experienced with React.
Here is my initial UI design. Hope someone in the community can help develop the frontend page.
Add metadata_filtering_complex_conditions
Add metadata conditions & sub_conditions
Please take a look, thank you @Yawen-1010