# -*- coding: utf-8 -*- """ AgentLoop Dataset 数据查询示例演示 Dataset 支持的四种查询模式： 1. 全文检索 field:keyword 关键词匹配 2. 语义搜索 similarity()（检索语法）/ semantic_distance()（SQL 函数） 3. SQL 分析标准 SQL (GROUP BY / WHERE / ORDER BY) 4. 组合查询检索条件 | SQL 语句，自由组合查询语法： ① 全文检索: field:keyword ② 语义搜索: similarity(field, 'text') < threshold ③ SQL 分析: SELECT ... FROM dataset_name ... ④ 组合查询: <检索条件> | 语义搜索提供两种形式： ▸ 检索语法: similarity(field, 'text') < threshold → 用于 | 左侧检索条件 ▸ SQL 函数: semantic_distance(field, 'text') → 用于 | 右侧 SQL 语句中所有查询统一通过 ExecuteQuery(type="SQL") 接口执行。使用前请设置环境变量： - SLSDEMO_ALIYUN_ACCESS_KEY_ID - SLSDEMO_ALIYUN_ACCESS_KEY_SECRET - ALIBABA_CLOUD_CMS_ENDPOINT (如: cms.cn-shanghai.aliyuncs.com) - ALIBABA_CLOUD_CMS_WORKSPACE """ import json import os from alibabacloud_cms20240330.client import Client from alibabacloud_cms20240330.models import ExecuteQueryRequest from alibabacloud_tea_openapi.models import Config from dotenv import load_dotenv load_dotenv() DATASET_NAME = "dataset_demo" def get_client() -> Client: config = Config( access_key_id=os.getenv("SLSDEMO_ALIYUN_ACCESS_KEY_ID"), access_key_secret=os.getenv("SLSDEMO_ALIYUN_ACCESS_KEY_SECRET"), endpoint=os.getenv("ALIBABA_CLOUD_CMS_ENDPOINT"), ) return Client(config) def query(client: Client, workspace: str, query_str: str) -> dict: """统一查询入口（type 固定为 SQL）""" request = ExecuteQueryRequest(query=query_str, type="SQL") response = client.execute_query(workspace, DATASET_NAME, request) return response.body.to_map() def show_results(title: str, query_str: str, result: dict, max_items: int = 5): """格式化打印查询语句和结果""" print(f"\n{'=' * 60}") print(f" {title}") print(f"{'=' * 60}") lines = [l.strip() for l in query_str.strip().splitlines() if l.strip()] if len(lines) == 1: print(f"Query: {lines[0]}") else: print("Query:") for line in lines: print(f" {line}") meta = result.get("meta", {}) data = result.get("data", []) count = meta.get("count", len(data)) elapsed = meta.get("elapsedMillisecond", "?") print(f"\n返回 {count} 条结果, 耗时 {elapsed}ms") for i, item in enumerate(data[:max_items]): print(f"\n--- 第 {i + 1} 条 ---") print(json.dumps(item, indent=2, ensure_ascii=False)) if len(data) > max_items: print(f"\n... 还有 {len(data) - max_items} 条结果未显示") # ============================================================================= # 示例 1: 全文检索 # ============================================================================= def example_fulltext_search(client: Client, workspace: str): """ 全文检索——关键词匹配语法与 SLS 查询语法一致： - field:keyword 单字段匹配 - field1:v1 AND field2:v2 组合条件 - field:v1 OR field:v2 或条件 - score > 0.9 数值范围可单独使用，也可通过管道 | 与 SQL 组合。 """ # 1a. 全文检索: 只需检索表达式 q = "input:错误" result = query(client, workspace, q) show_results("全文检索: input 包含 '错误'", q, result) # 1b. 全文 + SQL: 通过管道选择字段、排序 q = f"input:错误 | SELECT input, score FROM {DATASET_NAME} ORDER BY score DESC LIMIT 5" result = query(client, workspace, q) show_results("全文检索 + SQL: 过滤字段并排序", q, result) # 1c. 多条件全文检索 q = "model:qwen-plus AND score > 0.9" result = query(client, workspace, q) show_results("全文检索: model=qwen-plus 且 score > 0.9", q, result) # ============================================================================= # 示例 2: 语义搜索 # ============================================================================= def example_semantic_search(client: Client, workspace: str): """ 语义搜索——向量相似度提供两种使用形式：形式一 - 检索语法 similarity()： - 用于 | 左侧检索条件中 - similarity(field, 'query_text') < threshold - 可与全文检索条件通过 AND/OR 组合形式二 - SQL 函数 semantic_distance()： - 用于 | 右侧 SQL 语句中 - 可在 SELECT（获取距离值）、WHERE（过滤）、ORDER BY（排序）中使用两种形式的阈值范围均为 0-1，值越小越相似。建议阈值：0.1-0.2（严格），0.2-0.3（常规），0.3-0.5（宽松）前提：对应字段已在 Schema 中配置 embedding 向量索引 """ # 2a. 检索语法 similarity(): 用于 | 左侧 q = f"similarity(input, '日志查询统计') < 0.3" result = query(client, workspace, q) show_results("语义搜索 (similarity 检索语法): 与 '日志查询统计' 相似", q, result) # 2b. similarity() + 全文检索组合 q = f"similarity(input, 'API调用分析') < 0.3 AND score > 0.8" result = query(client, workspace, q) show_results("语义搜索 + 全文: 语义相似且 score > 0.8", q, result) # 2c. SQL 函数 semantic_distance(): 用于 | 右侧 SQL q = ( f"SELECT input, semantic_distance(input, '日志查询统计') AS similarity\n" f"FROM {DATASET_NAME}\n" f"WHERE semantic_distance(input, '日志查询统计') < 0.3\n" f"ORDER BY semantic_distance(input, '日志查询统计') ASC" ) result = query(client, workspace, q) show_results("语义搜索 (semantic_distance SQL 函数): 精确距离分析", q, result) # 2d. semantic_distance() + 条件过滤 q = ( f"SELECT input, output, score,\n" f" semantic_distance(input, 'API调用分析') AS similarity\n" f"FROM {DATASET_NAME}\n" f"WHERE semantic_distance(input, 'API调用分析') < 0.25\n" f" AND score > 0.8\n" f"ORDER BY similarity ASC" ) result = query(client, workspace, q) show_results("semantic_distance + 条件过滤: 语义相似且 score > 0.8", q, result) # ============================================================================= # 示例 3: SQL 分析 # ============================================================================= def example_sql_analysis(client: Client, workspace: str): """ SQL 分析——多维统计支持标准 SQL：SELECT / WHERE / GROUP BY / ORDER BY / LIMIT / CTE / 子查询等 """ # 3a. 分组统计 q = ( f"SELECT model,\n" f" count(*) AS total,\n" f" avg(score) AS avg_score,\n" f" min(score) AS min_score,\n" f" max(score) AS max_score\n" f"FROM {DATASET_NAME}\n" f"GROUP BY model\n" f"ORDER BY total DESC" ) result = query(client, workspace, q) show_results("SQL 分析: 各模型数据量与评分统计", q, result) # 3b. 条件过滤 + 排序 q = ( f"SELECT input, output, model, score\n" f"FROM {DATASET_NAME}\n" f"WHERE score >= 0.9\n" f"ORDER BY score DESC" ) result = query(client, workspace, q) show_results("SQL 分析: score >= 0.9 的高质量数据", q, result) # ============================================================================= # 示例 4: 组合查询（全文 + SQL + semantic_distance） # ============================================================================= def example_combined_query(client: Client, workspace: str): """ 组合查询四种查询模式可通过管道 | 自由组合： <全文条件> AND <语义条件> | semantic_distance 在 SQL 的 WHERE / ORDER BY 中使用。 """ # 4a. 全文 + SQL 排序 q = ( f"model:qwen-plus\n" f"| SELECT input, output, score\n" f" FROM {DATASET_NAME}\n" f" ORDER BY score DESC\n" f" LIMIT 5" ) result = query(client, workspace, q) show_results("组合查询: 全文过滤 + SQL 排序", q, result) # 4b. 全文 + SQL + semantic_distance q = ( f"model:qwen-plus\n" f"| SELECT input, output, score\n" f" FROM {DATASET_NAME}\n" f" WHERE semantic_distance(input, '数据分析') < 0.4\n" f" ORDER BY score DESC\n" f" LIMIT 5" ) result = query(client, workspace, q) show_results("组合查询: 全文 + semantic_distance + SQL", q, result) # 4c. similarity + SQL q = ( f"similarity(input, '日志分析') < 0.3\n" f"| SELECT input, score FROM {DATASET_NAME}\n" f" ORDER BY score DESC LIMIT 5" ) result = query(client, workspace, q) show_results("组合查询: similarity 检索 + SQL", q, result) # ============================================================================= # 示例 5: 分页查询 # ============================================================================= def example_pagination(client: Client, workspace: str): """ 分页查询使用 LIMIT offset, count 语法： - LIMIT count 返回前 count 条 - LIMIT offset, count 跳过 offset 条，返回 count 条 """ print(f"\n{'=' * 60}") print(" 分页查询: 每页 2 条，遍历前 3 页") print(f"{'=' * 60}") print("语法: LIMIT offset, count") page_size = 2 for page in range(1, 4): skip = (page - 1) * page_size sql = f"SELECT id, input, score FROM {DATASET_NAME} ORDER BY score DESC LIMIT {skip}, {page_size}" result = query(client, workspace, sql) data = result.get("data", []) print(f"\n Query: {sql}") print(f" 第 {page} 页 (LIMIT {skip}, {page_size}): 返回 {len(data)} 条") for item in data: print(f" [{item.get('score', '')}] {item.get('input', '')[:40]}") # ============================================================================= # 主函数 # ============================================================================= def main(): client = get_client() workspace = os.getenv("ALIBABA_CLOUD_CMS_WORKSPACE") print(f"AgentLoop Dataset 数据查询示例 (dataset: {DATASET_NAME})") print("=" * 60) example_fulltext_search(client, workspace) example_semantic_search(client, workspace) example_sql_analysis(client, workspace) example_combined_query(client, workspace) example_pagination(client, workspace) print("\n" + "=" * 60) print("查询示例完成") if __name__ == "__main__": main()