# -*- coding: utf-8 -*-
"""
AgentLoop Dataset 数据查询示例

演示 Dataset 支持的四种查询模式：
1. 全文检索      field:keyword 关键词匹配
2. 语义搜索      similarity()（检索语法）/ semantic_distance()（SQL 函数）
3. SQL 分析      标准 SQL (GROUP BY / WHERE / ORDER BY)
4. 组合查询      检索条件 | SQL 语句，自由组合

查询语法：
    ① 全文检索:     field:keyword
    ② 语义搜索:     similarity(field, 'text') < threshold
    ③ SQL 分析:     SELECT ... FROM dataset_name ...
    ④ 组合查询:     <检索条件> | <SQL 语句>

语义搜索提供两种形式：
    ▸ 检索语法: similarity(field, 'text') < threshold   → 用于 | 左侧检索条件
    ▸ SQL 函数: semantic_distance(field, 'text')        → 用于 | 右侧 SQL 语句中

所有查询统一通过 ExecuteQuery(type="SQL") 接口执行。

使用前请设置环境变量：
- SLSDEMO_ALIYUN_ACCESS_KEY_ID
- SLSDEMO_ALIYUN_ACCESS_KEY_SECRET
- ALIBABA_CLOUD_CMS_ENDPOINT   (如: cms.cn-shanghai.aliyuncs.com)
- ALIBABA_CLOUD_CMS_WORKSPACE
"""

import json
import os

from alibabacloud_cms20240330.client import Client
from alibabacloud_cms20240330.models import ExecuteQueryRequest
from alibabacloud_tea_openapi.models import Config
from dotenv import load_dotenv

load_dotenv()

DATASET_NAME = "dataset_demo"


def get_client() -> Client:
    config = Config(
        access_key_id=os.getenv("SLSDEMO_ALIYUN_ACCESS_KEY_ID"),
        access_key_secret=os.getenv("SLSDEMO_ALIYUN_ACCESS_KEY_SECRET"),
        endpoint=os.getenv("ALIBABA_CLOUD_CMS_ENDPOINT"),
    )
    return Client(config)


def query(client: Client, workspace: str, query_str: str) -> dict:
    """统一查询入口（type 固定为 SQL）"""
    request = ExecuteQueryRequest(query=query_str, type="SQL")
    response = client.execute_query(workspace, DATASET_NAME, request)
    return response.body.to_map()


def show_results(title: str, query_str: str, result: dict, max_items: int = 5):
    """格式化打印查询语句和结果"""
    print(f"\n{'=' * 60}")
    print(f"  {title}")
    print(f"{'=' * 60}")

    lines = [l.strip() for l in query_str.strip().splitlines() if l.strip()]
    if len(lines) == 1:
        print(f"Query: {lines[0]}")
    else:
        print("Query:")
        for line in lines:
            print(f"  {line}")

    meta = result.get("meta", {})
    data = result.get("data", [])
    count = meta.get("count", len(data))
    elapsed = meta.get("elapsedMillisecond", "?")
    print(f"\n返回 {count} 条结果, 耗时 {elapsed}ms")

    for i, item in enumerate(data[:max_items]):
        print(f"\n--- 第 {i + 1} 条 ---")
        print(json.dumps(item, indent=2, ensure_ascii=False))

    if len(data) > max_items:
        print(f"\n... 还有 {len(data) - max_items} 条结果未显示")


# =============================================================================
# 示例 1: 全文检索
# =============================================================================

def example_fulltext_search(client: Client, workspace: str):
    """
    全文检索——关键词匹配

    语法与 SLS 查询语法一致：
    - field:keyword             单字段匹配
    - field1:v1 AND field2:v2   组合条件
    - field:v1 OR field:v2      或条件
    - score > 0.9               数值范围

    可单独使用，也可通过管道 | 与 SQL 组合。
    """
    # 1a. 全文检索: 只需检索表达式
    q = "input:错误"
    result = query(client, workspace, q)
    show_results("全文检索: input 包含 '错误'", q, result)

    # 1b. 全文 + SQL: 通过管道选择字段、排序
    q = f"input:错误 | SELECT input, score FROM {DATASET_NAME} ORDER BY score DESC LIMIT 5"
    result = query(client, workspace, q)
    show_results("全文检索 + SQL: 过滤字段并排序", q, result)

    # 1c. 多条件全文检索
    q = "model:qwen-plus AND score > 0.9"
    result = query(client, workspace, q)
    show_results("全文检索: model=qwen-plus 且 score > 0.9", q, result)


# =============================================================================
# 示例 2: 语义搜索
# =============================================================================

def example_semantic_search(client: Client, workspace: str):
    """
    语义搜索——向量相似度

    提供两种使用形式：

    形式一 - 检索语法 similarity()：
    - 用于 | 左侧检索条件中
    - similarity(field, 'query_text') < threshold
    - 可与全文检索条件通过 AND/OR 组合

    形式二 - SQL 函数 semantic_distance()：
    - 用于 | 右侧 SQL 语句中
    - 可在 SELECT（获取距离值）、WHERE（过滤）、ORDER BY（排序）中使用

    两种形式的阈值范围均为 0-1，值越小越相似。
    建议阈值：0.1-0.2（严格），0.2-0.3（常规），0.3-0.5（宽松）

    前提：对应字段已在 Schema 中配置 embedding 向量索引
    """
    # 2a. 检索语法 similarity(): 用于 | 左侧
    q = f"similarity(input, '日志查询统计') < 0.3"
    result = query(client, workspace, q)
    show_results("语义搜索 (similarity 检索语法): 与 '日志查询统计' 相似", q, result)

    # 2b. similarity() + 全文检索组合
    q = f"similarity(input, 'API调用分析') < 0.3 AND score > 0.8"
    result = query(client, workspace, q)
    show_results("语义搜索 + 全文: 语义相似且 score > 0.8", q, result)

    # 2c. SQL 函数 semantic_distance(): 用于 | 右侧 SQL
    q = (
        f"SELECT input, semantic_distance(input, '日志查询统计') AS similarity\n"
        f"FROM {DATASET_NAME}\n"
        f"WHERE semantic_distance(input, '日志查询统计') < 0.3\n"
        f"ORDER BY semantic_distance(input, '日志查询统计') ASC"
    )
    result = query(client, workspace, q)
    show_results("语义搜索 (semantic_distance SQL 函数): 精确距离分析", q, result)

    # 2d. semantic_distance() + 条件过滤
    q = (
        f"SELECT input, output, score,\n"
        f"       semantic_distance(input, 'API调用分析') AS similarity\n"
        f"FROM {DATASET_NAME}\n"
        f"WHERE semantic_distance(input, 'API调用分析') < 0.25\n"
        f"  AND score > 0.8\n"
        f"ORDER BY similarity ASC"
    )
    result = query(client, workspace, q)
    show_results("semantic_distance + 条件过滤: 语义相似且 score > 0.8", q, result)


# =============================================================================
# 示例 3: SQL 分析
# =============================================================================

def example_sql_analysis(client: Client, workspace: str):
    """
    SQL 分析——多维统计

    支持标准 SQL：SELECT / WHERE / GROUP BY / ORDER BY / LIMIT / CTE / 子查询等
    """
    # 3a. 分组统计
    q = (
        f"SELECT model,\n"
        f"       count(*) AS total,\n"
        f"       avg(score) AS avg_score,\n"
        f"       min(score) AS min_score,\n"
        f"       max(score) AS max_score\n"
        f"FROM {DATASET_NAME}\n"
        f"GROUP BY model\n"
        f"ORDER BY total DESC"
    )
    result = query(client, workspace, q)
    show_results("SQL 分析: 各模型数据量与评分统计", q, result)

    # 3b. 条件过滤 + 排序
    q = (
        f"SELECT input, output, model, score\n"
        f"FROM {DATASET_NAME}\n"
        f"WHERE score >= 0.9\n"
        f"ORDER BY score DESC"
    )
    result = query(client, workspace, q)
    show_results("SQL 分析: score >= 0.9 的高质量数据", q, result)


# =============================================================================
# 示例 4: 组合查询（全文 + SQL + semantic_distance）
# =============================================================================

def example_combined_query(client: Client, workspace: str):
    """
    组合查询

    四种查询模式可通过管道 | 自由组合：
        <全文条件> AND <语义条件> | <SQL 语句>

    semantic_distance 在 SQL 的 WHERE / ORDER BY 中使用。
    """
    # 4a. 全文 + SQL 排序
    q = (
        f"model:qwen-plus\n"
        f"| SELECT input, output, score\n"
        f"  FROM {DATASET_NAME}\n"
        f"  ORDER BY score DESC\n"
        f"  LIMIT 5"
    )
    result = query(client, workspace, q)
    show_results("组合查询: 全文过滤 + SQL 排序", q, result)

    # 4b. 全文 + SQL + semantic_distance
    q = (
        f"model:qwen-plus\n"
        f"| SELECT input, output, score\n"
        f"  FROM {DATASET_NAME}\n"
        f"  WHERE semantic_distance(input, '数据分析') < 0.4\n"
        f"  ORDER BY score DESC\n"
        f"  LIMIT 5"
    )
    result = query(client, workspace, q)
    show_results("组合查询: 全文 + semantic_distance + SQL", q, result)

    # 4c. similarity + SQL
    q = (
        f"similarity(input, '日志分析') < 0.3\n"
        f"| SELECT input, score FROM {DATASET_NAME}\n"
        f"  ORDER BY score DESC LIMIT 5"
    )
    result = query(client, workspace, q)
    show_results("组合查询: similarity 检索 + SQL", q, result)


# =============================================================================
# 示例 5: 分页查询
# =============================================================================

def example_pagination(client: Client, workspace: str):
    """
    分页查询

    使用 LIMIT offset, count 语法：
    - LIMIT count              返回前 count 条
    - LIMIT offset, count      跳过 offset 条，返回 count 条
    """
    print(f"\n{'=' * 60}")
    print("  分页查询: 每页 2 条，遍历前 3 页")
    print(f"{'=' * 60}")
    print("语法: LIMIT offset, count")

    page_size = 2
    for page in range(1, 4):
        skip = (page - 1) * page_size
        sql = f"SELECT id, input, score FROM {DATASET_NAME} ORDER BY score DESC LIMIT {skip}, {page_size}"
        result = query(client, workspace, sql)
        data = result.get("data", [])
        print(f"\n  Query: {sql}")
        print(f"  第 {page} 页 (LIMIT {skip}, {page_size}): 返回 {len(data)} 条")
        for item in data:
            print(f"    [{item.get('score', '')}] {item.get('input', '')[:40]}")


# =============================================================================
# 主函数
# =============================================================================

def main():
    client = get_client()
    workspace = os.getenv("ALIBABA_CLOUD_CMS_WORKSPACE")

    print(f"AgentLoop Dataset 数据查询示例 (dataset: {DATASET_NAME})")
    print("=" * 60)

    example_fulltext_search(client, workspace)
    example_semantic_search(client, workspace)
    example_sql_analysis(client, workspace)
    example_combined_query(client, workspace)
    example_pagination(client, workspace)

    print("\n" + "=" * 60)
    print("查询示例完成")


if __name__ == "__main__":
    main()
