From faaa8e3802c6db99af7cc45f22aa7674c8a5cca9 Mon Sep 17 00:00:00 2001
From: jack
Date: Thu, 9 Apr 2026 14:24:08 +0800
Subject: [PATCH] Update the decoding template: detect braces and
 automatically add placeholder braces when they are missing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 alpha_submit/main.go                          |  261 +++
 alpha_submit/main.py                          |  168 ++
 check_llm_idea/add_braces_to_fields.py        |  166 ++
 check_llm_idea/check_llm_idea.py              |  199 +++
 check_llm_idea/llm_idea.md                    |  364 ++++
 check_llm_idea/prompt.md                      | 1479 +++++++++++++++++
 .../synchronize_alpha_performance_data.py     |  366 ++++
 test/wqb-login/login.py                       |    2 +-
 8 files changed, 3004 insertions(+), 1 deletion(-)
 create mode 100644 alpha_submit/main.go
 create mode 100644 alpha_submit/main.py
 create mode 100644 check_llm_idea/add_braces_to_fields.py
 create mode 100644 check_llm_idea/check_llm_idea.py
 create mode 100644 check_llm_idea/llm_idea.md
 create mode 100644 check_llm_idea/prompt.md
 create mode 100644 temporary_script/synchronize_alpha_performance_data.py

diff --git a/alpha_submit/main.go b/alpha_submit/main.go
new file mode 100644
index 0000000..fd06fcd
--- /dev/null
+++ b/alpha_submit/main.go
@@ -0,0 +1,261 @@
+package main
+
+import (
+    "encoding/json"
+    "fmt"
+    "io"
+    "log"
+    "net/http"
+    "strconv"
+    "strings"
+    "time"
+)
+
+const (
+    totalRetryCount = 100000
+    nacosURL        = "http://192.168.31.41:30848/nacos/v1/cs/configs?dataId=wq_account&group=quantify"
+)
+
+// NacosConfig maps the account configuration returned by nacos
+type NacosConfig struct {
+    UserName string `json:"user_name"`
+    Password string `json:"password"`
+}
+
+// basicAuthTransport automatically adds a Basic Auth header to every request
+type basicAuthTransport struct {
+    username string
+    password string
+    base     http.RoundTripper
+}
+
+func (t *basicAuthTransport) RoundTrip(req *http.Request) (*http.Response, error) {
+    req.SetBasicAuth(t.username, t.password)
+    return t.base.RoundTrip(req)
+}
+
+// Login signs in to the WorldQuant Brain API and returns an HTTP client with BasicAuth attached
+func Login() (*http.Client, error) {
+    // 1. Fetch the account credentials from nacos
+    resp, err := http.Get(nacosURL)
+    if err != nil {
+        log.Printf("failed to fetch the account configuration: %v", err)
+        return nil, err
+    }
+    defer resp.Body.Close()
+
+    if resp.StatusCode != http.StatusOK {
+        return nil, fmt.Errorf("nacos returned a non-200 status code: %d", resp.StatusCode)
+    }
+
+    var config NacosConfig
+    if err := json.NewDecoder(resp.Body).Decode(&config); err != nil {
+        return nil, fmt.Errorf("failed to parse the nacos configuration: %w", err)
+    }
+
+    log.Printf("logging in as account: %s", config.UserName)
+
+    // 2. Create the HTTP client; all subsequent requests will use BasicAuth
+    client := &http.Client{
+        Timeout: 30 * time.Second,
+        Transport: &basicAuthTransport{
+            username: config.UserName,
+            password: config.Password,
+            base:     http.DefaultTransport,
+        },
+    }
+
+    // 3. Send the login request
+    loginReq, _ := http.NewRequest("POST", "https://api.worldquantbrain.com/authentication", nil)
+    loginResp, err := client.Do(loginReq)
+    if err != nil {
+        return nil, fmt.Errorf("login request failed: %w", err)
+    }
+    defer loginResp.Body.Close()
+
+    log.Printf("login status: %d", loginResp.StatusCode)
+
+    if loginResp.StatusCode == http.StatusCreated {
+        log.Println("login succeeded!")
+        return client, nil
+    }
+
+    body, _ := io.ReadAll(loginResp.Body)
+    return nil, fmt.Errorf("login failed: %s", string(body))
+}
+
+// SubmitAlpha submits an alpha and retries automatically until it succeeds or the maximum retry count is reached.
+// A nil return value means the submission succeeded; otherwise an error is returned.
+func SubmitAlpha(alphaID string) error {
+    retryCount := 0
+    var client *http.Client
+
+    for retryCount < totalRetryCount {
+        // Log in again if there is no valid client
+        if client == nil {
+            var err error
+            client, err = Login()
+            if err != nil {
+                log.Printf("login failed: %v, retrying in 10 seconds (retry count: %d)", err, retryCount)
+                time.Sleep(10 * time.Second)
+                retryCount++
+                continue
+            }
+        }
+
+        url := fmt.Sprintf("https://api.worldquantbrain.com/alphas/%s/submit", alphaID)
+        log.Printf("request URL: %s", url)
+
+        // Send the submit request
+        resp, err := client.Post(url, "application/json", nil)
+        if err != nil {
+            log.Printf("network request error (alpha=%s, retry=%d): %v", alphaID, retryCount, err)
+            retryCount++
+            time.Sleep(10 * time.Second)
+            continue
+        }
+
+        // Handle the special 400 case: already submitted, start polling for the result
+        if resp.StatusCode == http.StatusBadRequest {
+            bodyBytes, _ := io.ReadAll(resp.Body)
+            resp.Body.Close()
+            bodyStr := string(bodyBytes)
+            if strings.Contains(bodyStr, "The plain HTTP request was sent to HTTPS port") {
+                log.Println("alpha already submitted, polling its status...")
+                pollInterval := 1.0 // seconds
+                for {
+                    time.Sleep(time.Duration(pollInterval) * time.Second)
+                    fmt.Print(".")
+                    // Query the current status again with a GET request
+                    pollResp, err := client.Get(url)
+                    if err != nil {
+                        log.Printf("polling request failed: %v", err)
+                        break
+                    }
+                    // Check the Retry-After header
+                    if retryAfter := pollResp.Header.Get("Retry-After"); retryAfter != "" {
+                        if f, err := strconv.ParseFloat(retryAfter, 64); err == nil && f > 0 {
+                            pollInterval = max(f, 3.0)
+                        } else {
+                            pollInterval = 3.0
+                        }
+                    } else {
+                        pollInterval = 3.0
+                    }
+                    // Exit the poll loop once the status is no longer 400 or the body no longer contains the specific message
+                    if pollResp.StatusCode != http.StatusBadRequest {
+                        resp = pollResp
+                        break
+                    }
+                    bodyBytes2, _ := io.ReadAll(pollResp.Body)
+                    pollResp.Body.Close()
+                    if !strings.Contains(string(bodyBytes2), "The plain HTTP request was sent to HTTPS port") {
+                        resp = pollResp
+                        break
+                    }
+                }
+                log.Printf("polling finished, final status code: %d", resp.StatusCode)
+            } else {
+                // Not the special 400 case; treat it as an ordinary error
+                resp.Body.Close()
+            }
+        }
+
+        // Make sure resp is not nil (skip this iteration if the branches above did not set it)
+        if resp == nil {
+            retryCount++
+            continue
+        }
+
+        // Handle the response by status code
+        switch resp.StatusCode {
+        case http.StatusTooManyRequests: // 429
+            log.Println("rate limited (429), sleeping 60 seconds before retrying")
+            resp.Body.Close()
+            time.Sleep(60 * time.Second)
+            retryCount++
+            continue
+
+        case http.StatusUnauthorized: // 401
+            log.Println("authentication expired, logging in again")
+            resp.Body.Close()
+            client = nil
+            retryCount++
+            continue
+
+        case http.StatusNotFound: // 404
+            log.Printf("alpha %s not found or timed out, retrying (%d/%d)", alphaID, retryCount+1, totalRetryCount)
+            resp.Body.Close()
+            retryCount++
+            continue
+
+        case http.StatusForbidden: // 403
+            log.Printf("%s submission failed (403)", alphaID)
+            var failChecks []map[string]interface{}
+            var bodyMap map[string]interface{}
+            if err := json.NewDecoder(resp.Body).Decode(&bodyMap); err == nil {
+                if isObj, ok := bodyMap["is"].(map[string]interface{}); ok {
+                    if checks, ok := isObj["checks"].([]interface{}); ok {
+                        for _, c := range checks {
+                            if ch, ok := c.(map[string]interface{}); ok {
+                                if result, ok := ch["result"]; ok && result == "FAIL" {
+                                    failChecks = append(failChecks, ch)
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+            resp.Body.Close()
+            log.Printf("failed checks: %v", failChecks)
+            // Give up retrying if the failure was caused by exceeding the submission quota
+            for _, ch := range failChecks {
+                if name, ok := ch["name"]; ok {
+                    if name == "REGULAR_SUBMISSION" || name == "SUPER_SUBMISSION" {
+                        return fmt.Errorf("submission quota exceeded: %v", failChecks)
+                    }
+                }
+            }
+            return fmt.Errorf("submission failed, HTTP 403")
+
+        case http.StatusOK: // 200
+            log.Printf("%s submitted successfully", alphaID)
+            resp.Body.Close()
+            return nil
+
+        default:
+            // Handle 5xx errors
+            if resp.StatusCode >= 500 && resp.StatusCode < 600 {
+                log.Printf("server error %d, retrying in 5 seconds", resp.StatusCode)
+                resp.Body.Close()
+                time.Sleep(5 * time.Second)
+                retryCount++
+                continue
+            }
+            // Any other non-2xx status code is treated as a failure
+            bodyBytes, _ := io.ReadAll(resp.Body)
+            resp.Body.Close()
+            return fmt.Errorf("unhandled response status code %d: %s", resp.StatusCode, string(bodyBytes))
+        }
+    }
+
+    return fmt.Errorf("reached the maximum retry count %d, submission failed", totalRetryCount)
+}
+
+// max returns the larger of two float64 values
+func max(a, b float64) float64 {
+    if a > b {
+        return a
+    }
+    return b
+}
+
+func main() {
+    // Usage example
+    alphaID := "your_alpha_id_here"
+    if err := SubmitAlpha(alphaID); err != nil {
+        log.Fatalf("submission failed: %v", err)
+    }
+    log.Println("submission complete")
+}
diff --git a/alpha_submit/main.py b/alpha_submit/main.py
new file mode 100644
index 0000000..dbd15c2
--- /dev/null
+++ b/alpha_submit/main.py
@@ -0,0 +1,168 @@
+import time
+import logging
+import httpx
+from httpx import BasicAuth
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+def login():
+    """
+    Log in to the WorldQuant Brain API and return an httpx.Client object.
+    Note: after a successful login the client already carries the auth credentials; no extra handling is needed.
+    """
+    # Fetch the account credentials from nacos
+    nacos_resp = httpx.get('http://192.168.31.41:30848/nacos/v1/cs/configs?dataId=wq_account&group=quantify')
+    if nacos_resp.status_code != 200:
+        logger.error('Failed to fetch the account credentials')
+        return None
+
+    config = nacos_resp.json()
+    username = config['user_name']
+    password = config['password']
+
+    logger.info(f"Logging in as account: {username}")
+
+    # Create the client with authentication attached
+    client = httpx.Client(auth=BasicAuth(username, password))
+
+    # Send the login request
+    response = client.post('https://api.worldquantbrain.com/authentication')
+    logger.info(f"Login status: {response.status_code}")
+
+    if response.status_code == 201:
+        logger.info("Login succeeded!")
+        logger.debug(response.json())
+        return client
+    else:
+        logger.error(f"Login failed: {response.json()}")
+        client.close()
+        return None
+
+
+def submit_alpha(alpha_id):
+    """
+    Submit an alpha, retrying automatically until it succeeds or the maximum retry count is reached.
+    Returns True if the submission succeeded, False if it ultimately failed.
+    """
+    TOTAL_RETRY_COUNT = 100000
+    retry_count = 0
+    client = None
+
+    while retry_count < TOTAL_RETRY_COUNT:
+        # Log in again if there is no valid client
+        if client is None:
+            client = login()
+            if client is None:
+                logger.error("Login failed, waiting 10 seconds before retrying")
+                time.sleep(10)
+                retry_count += 1
+                continue
+
+        try:
+            url = f"https://api.worldquantbrain.com/alphas/{alpha_id}/submit"
+            logger.debug(f'url: {url}')
+
+            # 1. Send the submit request
+            res = client.post(url)
+
+            # Handle the special case: already submitted (need to poll for the result)
+            if res.status_code == 400 and "The plain HTTP request was sent to HTTPS port" in res.text:
+                logger.info("Alpha already submitted, polling its status...")
+                # Poll for the final result
+                poll_interval = 1.0  # initial polling interval
+                while True:
+                    time.sleep(poll_interval)
+                    print(".", end="", flush=True)
+                    # Query the current status with a GET request
+                    poll_res = client.get(url)
+                    # Honor the server's Retry-After header if present
+                    if "retry-after" in poll_res.headers:
+                        poll_interval = max(float(poll_res.headers["retry-after"]), 3)
+                    else:
+                        poll_interval = 3  # default interval
+                    # Exit the poll loop once the status is no longer 400 (processing)
+                    if poll_res.status_code != 400 or "The plain HTTP request was sent to HTTPS port" not in poll_res.text:
+                        res = poll_res
+                        break
+                logger.info(f"Polling finished, final status code: {res.status_code}")
+
+            # 2. Handle the various status codes
+            if res.status_code == 429:
+                logger.info("Rate limited (429), sleeping 60 seconds before retrying")
+                time.sleep(60)
+                retry_count += 1
+                continue
+
+            if res.status_code == 401:
+                logger.warning("Authentication expired, logging in again")
+                if client:
+                    client.close()
+                client = None
+                retry_count += 1
+                continue
+
+            if res.status_code == 404:
+                logger.warning(f"Alpha {alpha_id} not found or timed out, retrying ({retry_count+1}/{TOTAL_RETRY_COUNT})")
+                retry_count += 1
+                continue
+
+            if res.status_code // 100 == 5:
+                logger.warning(f"Server error {res.status_code}, retrying in 5 seconds")
+                time.sleep(5)
+                retry_count += 1
+                continue
+
+            if res.status_code == 403:
+                logger.info(f"{alpha_id} submission failed (403)")
+                fail_checks = []
+                try:
+                    checks = res.json()["is"]["checks"]
+                    fail_checks = [x for x in checks if x.get("result") == "FAIL"]
+                except Exception as e:
+                    logger.error(f"Error while parsing the failure reason: {e}")
+
+                logger.info(f"Failed checks: {fail_checks}")
+                # If the submission quota was exceeded, stop immediately and do not retry
+                if any(x.get("name") in ["REGULAR_SUBMISSION", "SUPER_SUBMISSION"] for x in fail_checks):
+                    logger.error("Submission quota exceeded, giving up")
+                    return False
+                # Other 403 errors are also treated as permanent failures
+                return False
+
+            if res.status_code == 200:
+                logger.info(f"{alpha_id} submitted successfully")
+                return True
+
+            # Any other non-2xx status code is an unknown error; exit immediately
+            logger.error(f"Unhandled response status code {res.status_code}, giving up. Response body: {res.text[:200]}")
+            return False
+
+        except httpx.RequestError as e:
+            logger.error(f"Network request error (alpha_id={alpha_id}, retry={retry_count}): {e}")
+            retry_count += 1
+            time.sleep(10)
+            continue
+        except Exception as e:
+            logger.error(f"Unexpected exception (alpha_id={alpha_id}): {e}")
+            return False
+
+    # Exceeded the maximum retry count
+    logger.error(f"Reached the maximum retry count {TOTAL_RETRY_COUNT}, submission failed")
+    return False
+
+
+# Usage example
+if __name__ == "__main__":
+    # Test submitting one alpha
+    alpha_id = "your_alpha_id_here"
+    success = submit_alpha(alpha_id)
+    if success:
+        print("Submission complete")
+    else:
+        print("Submission failed")
\ No newline at end of file
diff --git a/check_llm_idea/add_braces_to_fields.py b/check_llm_idea/add_braces_to_fields.py
new file mode 100644
index 0000000..bf4f87c
--- /dev/null
+++ b/check_llm_idea/add_braces_to_fields.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""
+Script purpose: read the llm_idea.md file,
+check whether the data fields in each **Implementation Example** are missing braces,
+add braces where they are missing, and write the file back.
+"""
+
+import re
+from pathlib import Path
+
+
+def get_function_names():
+    """Return the set of known function names"""
+    return {
+        'divide', 'subtract', 'add', 'abs', 'power',
+        'greater', 'greater_equal', 'equal', 'and', 'or', 'not',
+        'ts_delay', 'ts_mean', 'ts_std_dev', 'ts_sum', 'ts_backfill',
+        'group_rank', 'group_neutralize', 'zscore',
+        'days_from_last_change'
+    }
+
+
+def get_keywords():
+    """Return the set of keywords"""
+    return {
+        'True', 'False', 'None', 'and', 'or', 'not', 'if', 'else',
+        'for', 'while', 'return', 'in', 'is', 'lambda'
+    }
+
+
+def add_braces_to_expression(expression):
+    """
+    Add braces around the data fields in an expression.
+
+    A fairly robust approach:
+    1. Find every part that is already wrapped in braces and mark it as handled
+    2. Find every unwrapped variable and add braces around it
+
+    Args:
+        expression: the original expression string
+
+    Returns:
+        str: the expression with braces added
+    """
+    func_names = get_function_names()
+    keywords = get_keywords()
+
+    # Result list used to build the final string
+    result = []
+    i = 0
+    length = len(expression)
+
+    while i < length:
+        # If the current character is '{', the content is already braced; skip ahead to the matching '}'
+        if expression[i] == '{':
+            # Find the matching '}'
+            j = expression.find('}', i)
+            if j != -1:
+                # Keep the existing braced content
+                result.append(expression[i:j+1])
+                i = j + 1
+            else:
+                # No closing '}' found; treat it as an ordinary character
+                result.append(expression[i])
+                i += 1
+        elif expression[i].isalpha() or expression[i] == '_':
+            # Possibly the start of an identifier
+            j = i
+            while j < length and (expression[j].isalnum() or expression[j] == '_'):
+                j += 1
+
+            word = expression[i:j]
+
+            # Check whether it is a variable (not a function name, keyword, or number)
+            if (word and
+                    not word[0].isdigit() and
+                    word not in func_names and
+                    word not in keywords):
+                # It is a variable; wrap it in braces
+                result.append('{' + word + '}')
+            else:
+                # Not a variable; keep it as-is
+                result.append(word)
+
+            i = j
+        else:
+            # Other characters (digits, operators, parentheses, etc.) are kept verbatim
+            result.append(expression[i])
+            i += 1
+
+    return ''.join(result)
+
+
+def process_file(file_path):
+    """
+    Process the file, adding braces to the data fields in each Implementation Example.
+
+    Args:
+        file_path: Path object pointing to the file to process
+
+    Returns:
+        bool: whether processing succeeded
+    """
+    try:
+        content = file_path.read_text(encoding='utf-8')
+    except FileNotFoundError:
+        print(f"Error: file not found: {file_path}")
+        return False
+    except Exception as e:
+        print(f"Error while reading the file: {e}")
+        return False
+
+    # Regex that matches the content after **Implementation Example**:
+    pattern = r'(\*\*Implementation Example\*\*:\s*`(.*?)`)'
+
+    def replace_match(match):
+        """Replacement callback: process each matched block"""
+        prefix = match.group(1).split('`')[0] + '`'  # everything up to and including the opening backtick
+        original_expr = match.group(2)  # the expression body
+        suffix = '`'  # the closing backtick
+
+        # Add braces to the expression
+        new_expr = add_braces_to_expression(original_expr)
+
+        return prefix + new_expr + suffix
+
+    # Perform the substitution
+    new_content = re.sub(pattern, replace_match, content, flags=re.DOTALL)
+
+    # Check whether anything changed
+    if new_content != content:
+        try:
+            file_path.write_text(new_content, encoding='utf-8')
+            print(f"✓ File updated: {file_path}")
+            return True
+        except Exception as e:
+            print(f"Error while writing the file: {e}")
+            return False
+    else:
+        print("✓ No changes needed (all data fields already have braces)")
+        return True
+
+
+def main():
+    # Locate llm_idea.md in the same directory as this script
+    current_dir = Path(__file__).parent
+    file_path = current_dir / "llm_idea.md"
+
+    print("=" * 80)
+    print("Implementation Example field brace completion tool")
+    print("=" * 80)
+    print(f"\nProcessing file: {file_path}\n")
+
+    # Process the file
+    success = process_file(file_path)
+
+    if success:
+        print("\n" + "=" * 80)
+        print("Done!")
+        print("=" * 80)
+    else:
+        print("\nProcessing failed!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/check_llm_idea/check_llm_idea.py b/check_llm_idea/check_llm_idea.py
new file mode 100644
index 0000000..62dbf5b
--- /dev/null
+++ b/check_llm_idea/check_llm_idea.py
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+"""
+Script purpose: read the llm_idea.md file in the current directory,
+match the content after **Implementation Example**:,
+and extract the arguments (variable names) of the function calls in each example.
+"""
+
+import re
+from pathlib import Path
+
+
+def extract_implementation_examples(file_path):
+    """
+    Extract the content of every Implementation Example from the markdown file.
+
+    Args:
+        file_path: Path object pointing to the file to read
+
+    Returns:
+        list: all matched example contents
+    """
+    try:
+        content = file_path.read_text(encoding='utf-8')
+    except FileNotFoundError:
+        print(f"Error: file not found: {file_path}")
+        return []
+    except Exception as e:
+        print(f"Error while reading the file: {e}")
+        return []
+
+    # Regex that matches the content after **Implementation Example**:
+    pattern = r'\*\*Implementation Example\*\*:\s*`(.*?)`'
+    matches = re.findall(pattern, content, re.DOTALL)
+
+    return matches
+
+def extract_variables_from_expression(expression):
+    """
+    Extract every variable name from an expression (variables appearing as function arguments),
+    excluding numeric constants (e.g. 1, 5, 20, 60).
+
+    Args:
+        expression: the function-call expression string
+
+    Returns:
+        list: the extracted variable names (in order of appearance)
+    """
+    # Match the arguments of function calls: func(arg1, arg2, ...)
+    # Use a regex to capture everything inside the parentheses, separated by commas
+    variables = []
+
+    # Approach 1: match the content inside every function call's parentheses
+    # This regex matches func_name(argument list)
+    func_call_pattern = r'(\w+)\s*\(([^()]*(?:\([^()]*\)[^()]*)*)\)'
+
+    def extract_from_text(text):
+        """Recursively extract variables from the text"""
+        # Find all function calls
+        matches = re.finditer(func_call_pattern, text)
+        for match in matches:
+            func_name = match.group(1)
+            args_str = match.group(2)
+
+            # Split the arguments (handles nested parentheses)
+            args = split_args_keeping_nesting(args_str)
+
+            # Process each argument
+            for arg in args:
+                arg = arg.strip()
+                # Skip numeric arguments
+                if re.match(r'^-?\d+(\.\d+)?$', arg):
+                    continue
+                # If the argument is a function call, recurse into it
+                if '(' in arg and ')' in arg:
+                    extract_from_text(arg)
+                # If the argument is an identifier (letters, digits, underscores; starting with a letter or underscore)
+                elif re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', arg):
+                    if arg not in variables:  # de-duplicate while preserving order
+                        variables.append(arg)
+                # If it is a compound expression (contains operators), try to extract its variables too
+                elif any(op in arg for op in ['+', '-', '*', '/', '>', '<', '=']):
+                    extract_variables_from_complex_expression(arg)
+
+    def split_args_keeping_nesting(args_str):
+        """
+        Split function arguments while respecting nested brackets.
+        For example: "a, b, c" -> ['a', 'b', 'c']
+                     "func(a,b), c" -> ['func(a,b)', 'c']
+        """
+        args = []
+        current_arg = []
+        paren_count = 0
+        bracket_count = 0  # square brackets
+        brace_count = 0  # curly braces
+
+        for char in args_str:
+            if char == ',' and paren_count == 0 and bracket_count == 0 and brace_count == 0:
+                args.append(''.join(current_arg).strip())
+                current_arg = []
+            else:
+                current_arg.append(char)
+                if char == '(':
+                    paren_count += 1
+                elif char == ')':
+                    paren_count -= 1
+                elif char == '[':
+                    bracket_count += 1
+                elif char == ']':
+                    bracket_count -= 1
+                elif char == '{':
+                    brace_count += 1
+                elif char == '}':
+                    brace_count -= 1
+
+        if current_arg:
+            args.append(''.join(current_arg).strip())
+
+        return args
+
+    def extract_variables_from_complex_expression(expr):
+        """Extract variables from a compound expression (e.g. a + b * c)"""
+        # Match identifiers (a letter or underscore followed by letters, digits, underscores)
+        var_pattern = r'\b[a-zA-Z_][a-zA-Z0-9_]*\b'
+        # Exclude common function names and keywords
+        keywords = {'and', 'or', 'not', 'if', 'else', 'for', 'while', 'return',
+                    'True', 'False', 'None', 'in', 'is', 'lambda'}
+
+        for match in re.finditer(var_pattern, expr):
+            var = match.group()
+            if var not in keywords and not re.match(r'^\d+$', var):
+                if var not in variables:
+                    variables.append(var)
+
+    # Start extracting
+    extract_from_text(expression)
+
+    # If nothing was extracted above, fall back to matching bare variables (expressions without function calls)
+    if not variables:
+        # Match simple identifiers
+        simple_var_pattern = r'\b[a-zA-Z_][a-zA-Z0-9_]*\b'
+        keywords = {'and', 'or', 'not', 'if', 'else', 'for', 'while', 'return',
+                    'True', 'False', 'None', 'divide', 'subtract', 'add', 'abs',
+                    'greater', 'equal', 'ts_delay', 'ts_mean', 'ts_std_dev',
+                    'count_bias_adjusted_price_target_estimates', 'group_rank',
+                    'zscore', 'days_from_last_change', 'ts_sum', 'power'}
+
+        for match in re.finditer(simple_var_pattern, expression):
+            var = match.group()
+            # Exclude numbers and function names
+            if (not re.match(r'^\d+$', var) and
+                    var not in keywords and
+                    var not in ['and', 'or', 'not']):
+                if var not in variables:
+                    variables.append(var)
+
+    return variables
+
+def main():
+    # Use pathlib to get the current working directory
+    current_dir = Path.cwd()
+    file_path = current_dir / "llm_idea.md"
+
+    print(f"Reading file: {file_path}")
+    print("=" * 80)
+
+    # Extract all Implementation Examples
+    examples = extract_implementation_examples(file_path)
+
+    if not examples:
+        print("No matching **Implementation Example**: `...` content found")
+        return
+
+    print(f"Found {len(examples)} Implementation Examples:\n")
+
+    # Iterate over the examples
+    for idx, example in enumerate(examples, 1):
+        print(f"{'='*80}")
+        print(f"Example {idx}:")
+        print(f"{'-'*40}")
+        print(f"Expression: {example}")
+        print(f"{'-'*40}")
+
+        # Extract the variable names
+        variables = extract_variables_from_expression(example)
+
+        if variables:
+            print(f"Extracted variables ({len(variables)}):")
+            for var_idx, var in enumerate(variables, 1):
+                print(f"  {var_idx}. {var}")
+        else:
+            print("No variable names extracted")
+
+        print()  # blank line between examples
+
+    print("=" * 80)
+    print(f"Done! Processed {len(examples)} examples")

+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/check_llm_idea/llm_idea.md b/check_llm_idea/llm_idea.md
new file mode 100644
index 0000000..50c26ba
--- /dev/null
+++ b/check_llm_idea/llm_idea.md
@@ -0,0 +1,364 @@
+# biasfree_analyst Feature Engineering Analysis Report
+
+**Dataset**: biasfree_analyst
+**Category**: Analyst
+**Region**: USA
+**Analysis Date**: 2026-04-09
+**Fields Analyzed**: 54
+
+---
+
+## Executive Summary
+
+**Primary Question Answered by Dataset**: How do analysts' bias-adjusted forecasts (price targets and fundamentals) vary across multiple "analogues" (bias removal methods), and what do these variations reveal about uncertainty, consensus strength, and potential mispricing?
+
+**Key Insights from Analysis**:
+- This dataset is unique because it provides multiple "bias-free analogues" (first, second, third) for the same underlying metric, rather than just a single consensus or raw value. This allows us to measure the *stability* of the bias-adjustment process itself.
+- The presence of standard deviation fields for each analogue group allows for direct measurement of *disagreement* among bias correction methodologies, which is a novel proxy for forecast ambiguity.
+- Revision counts (upward/downward) provide a dynamic signal of how the "clean" view of analysts is changing, stripped of systematic optimism or pessimism.
+
+**Critical Field Relationships Identified**:
+- The `_first_`, `_second_`, and `_third_` biasfree analogues represent different statistical approaches to removing bias. Comparing them reveals the sensitivity of the forecast to the choice of bias model.
+- `mean_` vs `median_` fields within the same analogue group highlight the skewness of the distribution of bias-adjusted estimates.
+- `stddev_` fields serve as direct measures of cross-analyst (or cross-model) uncertainty for the bias-free view.
+
+**Most Promising Feature Concepts**:
+1. **Bias Adjustment Fragility (Dispersion of Analogues)** - because it quantifies how much the "true" forecast changes depending on the specific bias-correction technique used.
+2. **Bias-Free Revision Momentum (Up-Down Ratio)** - because it isolates the directional change in analyst conviction *after* removing systematic biases.
+3. **Bias-Free Target Dispersion Ratio (Uncertainty-Adjusted Upside)** - because it evaluates the risk-adjusted upside implied by bias-free price targets.
+
+---
+
+## Dataset Deep Understanding
+
+### Dataset Description
+This dataset contains bias-adjusted analyst estimates for price targets and fundamentals. Unlike standard consensus data, it provides multiple "bias-free analogues" (first, second, third) generated by different statistical models. It also includes distribution statistics (mean, median, stddev, min, max, count) and revision counts for these bias-free metrics. The goal is to provide a cleaner, less behaviorally skewed view of analyst expectations.
+ +### Field Inventory +| Field ID | Description | Data Type | Update Frequency | Coverage | +|----------|-------------|-----------|------------------|----------| +| `biasfree_analyst_price_target` | Single analyst's bias-adjusted price target | Float | Event-driven | Moderate | +| `biasfree_analyst_fundamental_estimate` | Single analyst's bias-adjusted fundamental | Float | Event-driven | Moderate | +| `mean_bias_adjusted_price_target` | Mean of bias-adjusted price target estimates | Float | Event-driven | High | +| `mean_bias_adjusted_fundamental_estimate` | Mean of bias-adjusted fundamental estimates | Float | Event-driven | High | +| `median_bias_adjusted_price_target` | Median of bias-adjusted price target estimates | Float | Event-driven | High | +| `stddev_bias_adjusted_price_target` | Standard deviation of bias-adjusted price targets | Float | Event-driven | High | +| `num_upward_biasfree_price_target_revisions` | Count of upward bias-free PT revisions | Integer | Event-driven | Moderate | +| `num_downward_biasfree_price_target_revisions` | Count of downward bias-free PT revisions | Integer | Event-driven | Moderate | +| `avg_first_biasfree_price_target_estimate` | Average of first bias-free PT analogue | Float | Event-driven | High | +| `avg_second_biasfree_price_target_estimate` | Average of second bias-free PT analogue | Float | Event-driven | High | +| `avg_third_biasfree_price_target_estimate` | Average of third bias-free PT analogue | Float | Event-driven | High | +| `forecast_horizon_months` | Time horizon in months for the estimate | Integer | Static | High | + +*(Note: Only representative fields shown for brevity; analysis encompasses all 54 fields.)* + +### Field Deconstruction Analysis + +#### biasfree_analyst_price_target: Bias-Adjusted Analyst Price Target +- **What is being measured?**: A single analyst's price target after removing statistical bias (e.g., over-optimism). +- **How is it measured?**: Raw analyst target processed through a bias-correction model. +- **Time dimension**: Point-in-time snapshot (Event). +- **Business context**: Raw analyst targets are notoriously optimistic; this field aims to provide a "truer" expectation of future price. +- **Generation logic**: Proprietary bias model applied to raw data. +- **Reliability considerations**: Depends heavily on the accuracy of the bias model. Missing values mean no estimate was made or the bias model couldn't be applied. + +#### avg_first_biasfree_price_target_estimate: First Bias-Free Analogue Mean +- **What is being measured?**: The consensus (mean) of analyst estimates after applying the *first* specific bias-correction methodology. +- **How is it measured?**: Average of all `biasfree_analyst_price_target` values generated using "Model 1". +- **Time dimension**: Point-in-time snapshot (Event). +- **Business context**: Represents the "clean" view of the street using one specific debiasing lens. +- **Generation logic**: Cross-sectional mean calculation. +- **Reliability considerations**: Outliers (single extreme analysts) can skew the mean. + +#### stddev_first_biasfree_price_target_estimate: Dispersion of First Analogue +- **What is being measured?**: The level of disagreement among analysts *after* applying the first bias-correction model. +- **How is it measured?**: Standard deviation of the `avg_first_biasfree_price_target_estimate` component inputs. +- **Time dimension**: Point-in-time snapshot (Event). 
+- **Business context**: High standard deviation indicates that even after removing common bias, analysts strongly disagree on valuation. +- **Generation logic**: Cross-sectional standard deviation. +- **Reliability considerations**: Requires a minimum number of estimates (count) to be statistically meaningful. + +#### num_upward_biasfree_price_target_revisions: Bias-Free Optimism Flow +- **What is being measured?**: The number of analysts who raised their *bias-adjusted* price target. +- **How is it measured?**: Count of events where current bias-adjusted target > previous bias-adjusted target. +- **Time dimension**: Cumulative over a period (Event count). +- **Business context**: Distinguishes between "analyst getting more bullish" and "analyst just being less biased." A rise here signals genuine improvement in the *debiased* outlook. +- **Generation logic**: Event tracking and comparison. +- **Reliability considerations**: Zeros can mean no revisions or no coverage. + +### Field Relationship Mapping + +**The Story This Data Tells**: +This data tells the story of *consensus fragility* and *true conviction*. It doesn't just ask "What is the forecast?" but "How much does that forecast depend on *how* we clean the data?" and "How confident are analysts in the cleaned data?" The multiple analogues (`first_`, `second_`, `third_`) allow us to see the variance in the output of the data cleaning pipeline itself. + +**Key Relationships Identified**: +1. **Analogue Convergence/Divergence**: The spread between `avg_first`, `avg_second`, and `avg_third` biasfree estimates indicates the sensitivity of the "fair value" to the statistical debiasing technique. A large spread implies the valuation is highly dependent on the model assumption (High Uncertainty). +2. **Cross-Analyst Disagreement**: The `stddev_` fields measure how much individual analysts disagree *even after* removing their collective biases. High StdDev = High Disagreement = High Risk. +3. **Directional Pressure**: The ratio of `num_upward` to `num_downward` revisions shows the vector of change in the *bias-free* consensus. This is a leading indicator of changes in "smart money" expectations. + +**Missing Pieces That Would Complete the Picture**: +- **The Specific Bias Models**: Knowing if "first" is a simple industry adjustment and "third" is a complex ML model would add context. +- **Historical Timestamps**: We have the fields, but knowing the exact date of each revision/release is crucial for backtesting (implied by `delay=1`, but field-level dates are opaque here). +- **Actual Reported Fundamentals**: To calculate the "surprise" of the bias-free estimate vs. reality. + +--- + +## Feature Concepts by Question Type + +### Q1: "What is stable?" (Invariance Features) + +**Concept**: Bias Adjustment Fragility Score +- **Sample Fields Used**: `avg_first_biasfree_price_target_estimate`, `avg_second_biasfree_price_target_estimate`, `avg_third_biasfree_price_target_estimate` +- **Definition**: The coefficient of variation across the three distinct bias-free price target analogues. Formula: `stddev(analogue1, analogue2, analogue3) / mean(analogue1, analogue2, analogue3)`. +- **Why This Feature**: It answers: "Is the fair value estimate robust to the choice of debiasing technique?" If the answer is no (high fragility), the stock's valuation is highly subjective and likely prone to larger price swings on news. +- **Logical Meaning**: Measures the model risk inherent in the analyst consensus. 
A fragile stock is one where quants cannot agree on what the "clean" number even is.
+- **Is filling nan necessary**: Yes. If only one analogue exists, fragility is undefined. We should fill NaN with 0 (meaning no evidence of fragility) or use a neutral value. Better yet, mask the feature where `count_bias_adjusted_price_target_estimates` < 2.
+- **Directionality**: High Value = High Fragility/Model Risk (Potentially bearish/risky). Low Value = Robust Consensus (Potentially safer/more reliable).
+- **Boundary Conditions**: Extremely high values indicate the bias correction methods contradict each other violently (one says buy, one says sell).
+- **Implementation Example**: `divide({stddev_analogues}, abs({mean_analogues}))` where the inputs are the three average fields.
+
+**Concept**: Fundamental Estimate Robustness Ratio
+- **Sample Fields Used**: `median_bias_adjusted_fundamental_estimate`, `min_bias_adjusted_fundamental_estimate`, `max_bias_adjusted_fundamental_estimate`
+- **Definition**: The ratio of the interquartile range or full range of bias-adjusted fundamental estimates relative to the median. Proxy: `(max_bias_adjusted_fundamental_estimate - min_bias_adjusted_fundamental_estimate) / abs(median_bias_adjusted_fundamental_estimate)`.
+- **Why This Feature**: Similar to the above but for fundamentals (EPS, Sales). A high range means analysts wildly disagree on the upcoming fundamental performance *even after debiasing*.
+- **Logical Meaning**: Measures uncertainty about the company's near-term operational reality.
+- **Is filling nan necessary**: Yes. Use `group_mean` backfill or 0 if the range is undefined (only 1 estimate).
+- **Directionality**: High Value = High Earnings Uncertainty. Low Value = High Earnings Visibility.
+- **Boundary Conditions**: Infinite if the median is 0. Cap at a reasonable threshold (e.g., 10).
+- **Implementation Example**: `divide(subtract({max_bias_adjusted_fundamental_estimate}, {min_bias_adjusted_fundamental_estimate}), abs({median_bias_adjusted_fundamental_estimate}))`
+
+---
+
+### Q2: "What is changing?" (Dynamics Features)
+
+**Concept**: Bias-Free Revision Momentum (PT)
+- **Sample Fields Used**: `num_upward_biasfree_price_target_revisions`, `num_downward_biasfree_price_target_revisions`
+- **Definition**: The net directional flow of bias-free price target changes. Formula: `(Up - Down) / (Up + Down + 1)`.
+- **Why This Feature**: Raw revision ratios are often skewed by analyst optimism. Since these are *bias-free* revisions, positive momentum signals genuine improvement in the clean data signal, not just behavioral bias.
+- **Logical Meaning**: Net directional conviction of the bias-corrected analyst community.
+- **Is filling nan necessary**: Yes. Use `ts_backfill` or `0` if there are no revisions. The `+1` in the denominator prevents division by zero.
+- **Directionality**: High Positive = Strong Bias-Free Upward Momentum (Bullish). High Negative = Strong Bias-Free Downward Momentum (Bearish).
+- **Boundary Conditions**: Values near +1 or -1 indicate unanimous revision direction in the recent period.
+- **Implementation Example**: `divide(subtract({num_upward_biasfree_price_target_revisions}, {num_downward_biasfree_price_target_revisions}), add({num_upward_biasfree_price_target_revisions}, {num_downward_biasfree_price_target_revisions}, 1))`
+
+**Concept**: Bias-Free Earnings Momentum Change
+- **Sample Fields Used**: `num_upward_biasfree_fundamental_revisions`, `num_downward_biasfree_fundamental_revisions`
+- **Definition**: The change in the Bias-Free Revision Momentum (calculated above) over a short window (e.g., 5 days). `momentum_today - momentum_5_days_ago`.
+- **Why This Feature**: Captures the *acceleration* or *deceleration* of bias-free sentiment. A shift from negative to positive momentum is a powerful turnaround signal.
+- **Logical Meaning**: The rate of change of clean analyst conviction.
+- **Is filling nan necessary**: Yes. Use `ts_backfill` for missing historical momentum values.
+- **Directionality**: Positive Change = Improving Bias-Free Outlook. Negative Change = Deteriorating Bias-Free Outlook.
+- **Boundary Conditions**: Requires sufficient revision volume. Noisy on illiquid stocks.
+- **Implementation Example**: `subtract({momentum}, ts_delay({momentum}, 5))`
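+
+Both Q2 concepts are easy to prototype outside the BRAIN expression language. The following is a minimal pandas sketch with hypothetical daily revision counts for a single stock; per the NaN notes above, missing counts are treated as 0:
+
+```python
+import pandas as pd
+
+# Hypothetical daily revision counts for one stock.
+df = pd.DataFrame({
+    "num_upward_biasfree_price_target_revisions":   [2, 0, 1, 3, 0, 4, 1],
+    "num_downward_biasfree_price_target_revisions": [1, 1, 0, 0, 2, 0, 0],
+}).fillna(0)
+
+up = df["num_upward_biasfree_price_target_revisions"]
+down = df["num_downward_biasfree_price_target_revisions"]
+
+# Concept 1: net revision momentum, bounded in (-1, 1); the +1 avoids division by zero.
+momentum = (up - down) / (up + down + 1)
+
+# Concept 2: 5-day change in momentum (acceleration of clean sentiment).
+momentum_change = momentum - momentum.shift(5)
+print(momentum.round(3).tolist())
+print(momentum_change.round(3).tolist())
+```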
+
+---
+
+### Q3: "What is anomalous?" (Deviation Features)
+
+**Concept**: Bias-Free Consensus Divergence
+- **Sample Fields Used**: `biasfree_analyst_price_target`, `median_bias_adjusted_price_target`, `stddev_bias_adjusted_price_target`
+- **Definition**: The z-score of the current *median* bias-adjusted price target relative to its own 20-day history. `(median_pt - ts_mean(median_pt, 20)) / ts_std_dev(median_pt, 20)`.
+- **Why This Feature**: Detects when the "clean" consensus view of fair value has moved significantly away from its recent range. This is a structural shift in how quants/modelers view the stock.
+- **Logical Meaning**: A breakout or breakdown in the bias-free valuation framework.
+- **Is filling nan necessary**: Yes. Backfill with `ts_backfill` for recent gaps. Mask if `ts_std_dev` is 0.
+- **Directionality**: High Z-Score = Bias-Free Target has spiked up significantly (Bullish momentum). Low Z-Score = Bias-Free Target has crashed (Bearish momentum).
+- **Boundary Conditions**: Extreme values (>3 or <-3) indicate a potential regime change or data error.
+- **Implementation Example**: `divide(subtract({median_bias_adjusted_price_target}, ts_mean({median_bias_adjusted_price_target}, 20)), ts_std_dev({median_bias_adjusted_price_target}, 20))`
+
+**Concept**: Analyst Silent Treatment (Zero Revision Anomaly)
+- **Sample Fields Used**: `num_upward_biasfree_price_target_revisions`, `num_downward_biasfree_price_target_revisions`, `count_bias_adjusted_price_target_estimates`
+- **Definition**: A binary flag identifying stocks with high coverage (`count > 5`) but zero bias-free revisions (`up = 0 AND down = 0`) for a sustained period (e.g., 10 days).
+- **Why This Feature**: If many analysts cover a stock but NO ONE is changing their bias-adjusted view, it signals extreme uncertainty or a "wait and see" mode preceding a major event (earnings, FDA approval). It's the calm before the storm.
+- **Logical Meaning**: Information vacuum or gridlock in the professional forecasting community.
+- **Is filling nan necessary**: No. We use logical operators to create a binary flag. NaN in counts/revisions should be treated as 0 (no data = no signal).
+- **Directionality**: Flag = 1 indicates an anomaly (Potential for high volatility breakout).
+- **Boundary Conditions**: Avoid flagging small caps with 1 or 2 analysts.
+- **Implementation Example**: `and(greater({count_bias_adjusted_price_target_estimates}, 5), equal(add({num_upward_biasfree_price_target_revisions}, {num_downward_biasfree_price_target_revisions}), 0))`
+
+---
+
+### Q4: "What is combined?" (Interaction Features)
+
+**Concept**: Uncertainty-Adjusted Price Target Upside
+- **Sample Fields Used**: `median_bias_adjusted_price_target`, `stddev_bias_adjusted_price_target`, `close` (External Data)
+- **Definition**: The implied return to the bias-free price target, penalized by the dispersion of those estimates. Formula: `(Target / Price - 1) / (1 + CoV_Target)`.
+- **Why This Feature**: A stock with 20% upside but high disagreement among bias-adjusted models is riskier than a stock with 10% upside and tight agreement. This metric favors high-conviction, low-uncertainty opportunities.
+- **Logical Meaning**: Risk-adjusted expected return based solely on the bias-free analyst view.
+- **Is filling nan necessary**: Yes. Fill `stddev` with the mean if missing, or mask. Fill `Price` with `ts_backfill`.
+- **Directionality**: High Value = Attractive Risk/Reward based on clean analyst data.
+- **Boundary Conditions**: Negative values mean the target is below the current price (Downside).
+- **Implementation Example**: `divide(subtract(divide({median_bias_adjusted_price_target}, {price}), 1), add(1, divide({stddev_bias_adjusted_price_target}, abs({median_bias_adjusted_price_target}))))`
+
+**Concept**: Bias-Free Earnings Visibility Score
+- **Sample Fields Used**: `median_bias_adjusted_fundamental_estimate`, `stddev_bias_adjusted_fundamental_estimate`, `count_bias_adjusted_fundamental_estimates`
+- **Definition**: A composite score measuring the "cleanliness" and "strength" of the fundamental forecast. Formula: `Count / (1 + (StdDev / Median))`.
+- **Why This Feature**: High count + Low dispersion = High visibility. Low count + High dispersion = Low visibility. This distills the quality of the earnings signal into one number.
+- **Logical Meaning**: A measure of how reliable the bias-adjusted earnings forecast is.
+- **Is filling nan necessary**: Yes. Cap the denominator at some maximum. Treat a missing `median` as 0.
+- **Directionality**: High Value = High Visibility/Reliability. Low Value = Garbage In, Garbage Out.
+- **Boundary Conditions**: Very high scores indicate "obvious" earnings stories (low alpha potential due to efficiency). Very low scores indicate "speculative" stories (high risk/reward).
+- **Implementation Example**: `divide({count_bias_adjusted_fundamental_estimates}, add(1, divide({stddev_bias_adjusted_fundamental_estimate}, abs({median_bias_adjusted_fundamental_estimate}))))`
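+
+As a plain-arithmetic illustration of the Uncertainty-Adjusted Price Target Upside (hypothetical numbers; `target` stands in for the median bias-adjusted target):
+
+```python
+# Upside penalized by estimate dispersion, mirroring (Target / Price - 1) / (1 + CoV).
+target, price, stddev = 120.0, 100.0, 18.0  # hypothetical inputs
+cov = stddev / abs(target)                  # dispersion relative to the target: 0.15
+upside = target / price - 1                 # raw implied return: 0.20
+adjusted = upside / (1 + cov)               # 0.20 / 1.15 ~= 0.174
+print(round(adjusted, 3))
+```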
+
+---
+
+### Q5: "What is structural?" (Composition Features)
+
+**Concept**: Model Dependency Ratio (First vs. Third Analogue)
+- **Sample Fields Used**: `avg_first_biasfree_price_target_estimate`, `avg_third_biasfree_price_target_estimate`
+- **Definition**: The ratio of the First Analogue Mean to the Third Analogue Mean. `First_Mean / Third_Mean`.
+- **Why This Feature**: If the first analogue (presumably simpler) and third analogue (presumably complex/ML) diverge significantly, it indicates a stock whose valuation is highly sensitive to complex model specifications. This is a proxy for "Quant Complexity Risk."
+- **Logical Meaning**: Measures how much the "fair value" estimate changes when using a sophisticated bias model vs. a basic one.
+- **Is filling nan necessary**: Yes. Fill missing analogues with the median of the available ones.
+- **Directionality**: Value >> 1.0 = Complex model values stock much lower (Model Risk). Value << 1.0 = Complex model values stock much higher (Model Speculation).
+- **Boundary Conditions**: Values near 1.0 indicate model stability.
+- **Implementation Example**: `divide({avg_first_biasfree_price_target_estimate}, {avg_third_biasfree_price_target_estimate})`
+
+**Concept**: Target Horizon Skew Indicator
+- **Sample Fields Used**: `forecast_horizon_months`, `median_bias_adjusted_price_target`
+- **Definition**: The ratio of the median price target to the current price, annualized by the forecast horizon. `(Target/Price)^(12/Horizon) - 1`.
+- **Why This Feature**: Normalizes the price target return for time. A 20% return over 24 months is less impressive than a 15% return over 6 months.
+- **Logical Meaning**: Annualized expected return derived from bias-free price targets.
+- **Is filling nan necessary**: Yes. If `forecast_horizon_months` is missing, assume 12 months.
+- **Directionality**: High Value = High annualized expected return.
+- **Boundary Conditions**: Very short horizons (1 month) with extreme targets can produce unrealistic annualized figures. Cap at 1000%.
+- **Implementation Example**: `subtract(power(divide({median_bias_adjusted_price_target}, {price}), divide(12, {forecast_horizon_months})), 1)`
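+
+Worked through with hypothetical numbers, the annualization makes the time penalty explicit: a target 20% above the current price on an 18-month horizon annualizes to roughly 12.9%, not 20%:
+
+```python
+# (Target/Price)^(12/Horizon) - 1 with hypothetical inputs.
+target, price, horizon_months = 120.0, 100.0, 18
+annualized = (target / price) ** (12 / horizon_months) - 1
+print(round(annualized, 3))  # 0.129
+```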
+
+---
+
+### Q6: "What is cumulative?" (Accumulation Features)
+
+**Concept**: Cumulative Bias-Free Revision Imbalance
+- **Sample Fields Used**: `num_upward_biasfree_price_target_revisions`, `num_downward_biasfree_price_target_revisions`
+- **Definition**: The cumulative sum of the net revision count (Up - Down) over a trailing 60-day window. `ts_sum(up - down, 60)`.
+- **Why This Feature**: Smooths out the daily noise in revision counts to reveal the medium-term trend in bias-free sentiment. A consistently positive imbalance over 60 days is a strong bull signal.
+- **Logical Meaning**: The accumulated pressure of bias-free analyst conviction.
+- **Is filling nan necessary**: Yes. Treat NaN revisions as 0 in the sum.
+- **Directionality**: High Positive = Sustained Bias-Free Optimism. High Negative = Sustained Bias-Free Pessimism.
+- **Boundary Conditions**: Reversal patterns occur when the cumulative sum peaks and rolls over.
+- **Implementation Example**: `ts_sum(subtract({num_upward_biasfree_price_target_revisions}, {num_downward_biasfree_price_target_revisions}), 60)`
+
+**Concept**: Bias-Free Estimate Convergence Countdown
+- **Sample Fields Used**: `stddev_bias_adjusted_fundamental_estimate`, `count_bias_adjusted_fundamental_estimates`
+- **Definition**: A time decay feature that counts the number of days since the `stddev_bias_adjusted_fundamental_estimate` last widened significantly.
+- **Why This Feature**: As an earnings announcement approaches, uncertainty (StdDev) should drop as information is disseminated. If StdDev remains high *and* we are close to the announcement date, it signals a high-probability surprise event.
+- **Logical Meaning**: Measures the failure of the market to resolve uncertainty before a known catalyst.
+- **Is filling nan necessary**: Yes. Use `ts_backfill`.
+- **Directionality**: High Days Count + High Current StdDev = Elevated Risk of Earnings Surprise.
+- **Boundary Conditions**: Requires knowledge of the earnings calendar (external data) for best accuracy.
+- **Implementation Example**: `days_from_last_change({stddev_bias_adjusted_fundamental_estimate})`
+
+---
+
+### Q7: "What is relative?" (Comparison Features)
+
+**Concept**: Bias-Free Target vs. Sector Median
+- **Sample Fields Used**: `median_bias_adjusted_price_target`
+- **Definition**: The cross-sectional rank of the bias-free price target upside within its sector (requires external sector mapping). `group_rank(upside, sector)`.
+- **Why This Feature**: A high bias-free target is only meaningful if it's higher than peers. This identifies stocks where the *clean data* suggests relative outperformance within a sector.
+- **Logical Meaning**: Relative attractiveness of the bias-free valuation.
+- **Is filling nan necessary**: Yes. Use `group_neutralize` or `group_rank`.
+- **Directionality**: High Rank (0.8-1.0) = Top relative bias-free upside. Low Rank (0.0-0.2) = Bottom relative bias-free upside.
+- **Boundary Conditions**: Sectors with few stocks will have noisy ranks.
+- **Implementation Example**: `group_rank(divide({median_bias_adjusted_price_target}, {price}), {sector})`
+
+**Concept**: Bias-Free Fundamental vs. Historical Actual
+- **Sample Fields Used**: `median_bias_adjusted_fundamental_estimate`, `eps_actual_ttm` (External Data)
+- **Definition**: The ratio of the bias-free fundamental estimate to the trailing twelve-month actual fundamental. `Estimate / Actual`.
+- **Why This Feature**: Shows the expected growth/decline in fundamentals, stripped of analyst bias. A high ratio suggests strong expected operational growth.
+- **Logical Meaning**: Bias-adjusted expected growth rate.
+- **Is filling nan necessary**: Yes. Backfill actuals.
+- **Directionality**: High Value = High Expected Fundamental Growth.
+- **Boundary Conditions**: Extreme values may indicate one-time items or data errors in the "Actual" field.
+- **Implementation Example**: `divide({median_bias_adjusted_fundamental_estimate}, {eps_actual_ttm})`
+
+---
+
+### Q8: "What is essential?" (Essence Features)
+
+**Concept**: Bias-Free Alpha Signal Strength
+- **Sample Fields Used**: `mean_bias_adjusted_price_target`, `stddev_bias_adjusted_price_target`, `num_upward_biasfree_price_target_revisions`, `num_downward_biasfree_price_target_revisions`
+- **Definition**: A composite z-score of the three core components of this dataset: (1) Implied Upside, (2) Estimate Dispersion, (3) Revision Momentum. Combined into a single score.
+- **Why This Feature**: This distills the entire dataset into one clean alpha signal. It answers: "Based on *all* the bias-free data, how bullish or bearish is the clean signal?"
+- **Logical Meaning**: The holistic, model-free (ironically) summary of the bias-free analyst view.
+- **Is filling nan necessary**: Yes. Each component z-score should be normalized cross-sectionally.
+- **Directionality**: High Positive = Strong Bias-Free Bullish Signal. High Negative = Strong Bias-Free Bearish Signal.
+- **Boundary Conditions**: This is the core trading signal derived from the dataset.
+- **Implementation Example**: `zscore({upside}) - zscore({dispersion}) + zscore({momentum})`
+
+**Concept**: Bias-Free Data Quality Flag
+- **Sample Fields Used**: `count_bias_adjusted_price_target_estimates`, `count_bias_adjusted_fundamental_estimates`
+- **Definition**: A binary mask: 1 if `count_pt >= 3 AND count_fund >= 3`, else 0.
+- **Why This Feature**: All derived features from this dataset are statistically meaningless if the underlying sample size is too small. This flag ensures we only trade on robust data.
+- **Logical Meaning**: Minimum Viable Data Threshold for Bias-Free Analysis.
+- **Is filling nan necessary**: No. Treat NaN counts as 0.
+- **Directionality**: 1 = Reliable Data. 0 = Unreliable Data.
+- **Boundary Conditions**: This should be used as a filter in a `trade_when` condition.
+- **Implementation Example**: `and(greater_equal({count_bias_adjusted_price_target_estimates}, 3), greater_equal({count_bias_adjusted_fundamental_estimates}, 3))`
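+
+The composite Q8 signal can be prototyped cross-sectionally in a few lines. Below is a minimal pandas sketch over a hypothetical four-stock panel; the `upside`, `dispersion`, and `momentum` columns stand in for the three component features defined earlier:
+
+```python
+import pandas as pd
+
+# Hypothetical cross-section: one row per stock.
+panel = pd.DataFrame({
+    "upside":     [0.15, 0.02, -0.05, 0.30],
+    "dispersion": [0.10, 0.25, 0.05, 0.40],
+    "momentum":   [0.50, -0.20, 0.10, 0.60],
+})
+
+# Cross-sectional z-scores, then the composite:
+# zscore(upside) - zscore(dispersion) + zscore(momentum).
+zs = (panel - panel.mean()) / panel.std()
+signal = zs["upside"] - zs["dispersion"] + zs["momentum"]
+print(signal.round(2))
+```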
+
+---
+
+## Implementation Considerations
+
+### Data Quality Notes
+- **Coverage**: Moderate to High for the TOP200 universe. Smaller cap stocks may have sparse or missing analyst coverage.
+- **Timeliness**: Event-driven. Data updates when analysts publish or revise estimates. There can be gaps of weeks with no new data.
+- **Accuracy**: Depends on the proprietary bias-correction models used by the data vendor. The "truth" of the bias correction is unobservable.
+- **Potential Biases**: Survivorship bias (analysts drop coverage of failing companies). Model bias (the bias-correction models themselves may have systematic errors).
+
+### Computational Complexity
+- **Lightweight features**: Ratio calculations, logical flags, simple differences.
+- **Medium complexity**: Rolling Z-scores (`ts_zscore`), cumulative sums (`ts_sum`).
+- **Heavy computation**: Cross-sectional group ranks and neutralizations (`group_rank`, `group_neutralize`).
+
+### Recommended Prioritization
+
+**Tier 1 (Immediate Implementation)**:
+1. **Bias Adjustment Fragility Score** - Unique differentiator of this dataset.
+2. **Bias-Free Revision Momentum (PT)** - Direct, clean alpha signal.
+3. **Bias-Free Data Quality Flag** - Essential filter for all other features.
+
+**Tier 2 (Secondary Priority)**:
+1. **Uncertainty-Adjusted Price Target Upside** - Combines signal with risk.
+2. **Fundamental Estimate Robustness Ratio** - Checks earnings visibility.
+
+**Tier 3 (Requires Further Validation)**:
+1. **Analyst Silent Treatment** - Interesting anomaly but needs backtest validation.
+
+---
+
+## Critical Questions for Further Exploration
+
+### Unanswered Questions:
+1. What is the exact statistical difference between the `first`, `second`, and `third` bias-free analogues? (e.g., Linear Regression vs. Neural Net vs. Bayesian).
+2. What is the average decay rate of a bias-free revision? Does it predict returns for 5 days or 50 days?
+3. Are there specific sectors where bias-free data is most predictive (e.g., Tech) and others where it fails (e.g., Utilities)?
+
+### Recommended Additional Data:
+- **Sector/Industry Classification**: Required for cross-sectional relative value features.
+- **Actual Earnings Announcement Dates**: To align estimates with reality and measure "Bias-Free Surprise."
+- **Historical Stock Prices**: Required for all return/upside calculations.
+
+### Assumptions to Challenge:
+- **Assumption**: "Bias-free" means "Better." We should challenge whether removing bias also removes a *predictive* signal (e.g., some biases are self-fulfilling prophecies).
+- **Assumption**: All analogues are equally valid. The market may favor one bias-correction method over another.
+
+---
+
+## Methodology Notes
+
+**Analysis Approach**: This report was generated by:
+1. Deep field deconstruction to understand data essence (Multiple bias-correction analogues).
+2. Question-driven feature generation (8 fundamental questions).
+3. Logical validation of each feature concept.
+4. Transparent documentation of reasoning.
+
+**Design Principles**:
+- Focus on logical meaning over conventional patterns.
+- Every feature must answer a specific question.
+- Clear documentation of "why" for each suggestion.
+- Emphasis on data understanding over prediction. + +--- +*Report generated: 2026-04-09* +*Analysis depth: Comprehensive field deconstruction + 8-question framework* +*Next steps: Implement Tier 1 features, validate assumptions, gather additional data as needed* \ No newline at end of file diff --git a/check_llm_idea/prompt.md b/check_llm_idea/prompt.md new file mode 100644 index 0000000..6a2571a --- /dev/null +++ b/check_llm_idea/prompt.md @@ -0,0 +1,1479 @@ +# System Prompt + +You are executing two skills in sequence: +1) brain-data-feature-engineering +2) brain-feature-implementation +The following SKILL.md documents are authoritative; follow them exactly. + +--- SKILL.md (brain-data-feature-engineering) --- +--- +brain-data-feature-engineering methodology +--- + +# BRAIN Data Feature Engineering Workflow + +**Purpose**: Automatically transform BRAIN dataset fields into deep, meaningful feature engineering ideas. + +## Input Requirements + +### Required Parameters: +- **data_category**: Dataset category (e.g., "fundamental", "analyst", "news", "model") +- **delay**: Data delay setting (0 or 1) +- **region**: Market region (e.g., "USA", "EUR", "ASI") + +### Optional Parameters: +- **universe**: Trading universe (default: "TOP3000") +- **dataset_id**: Specific dataset ID (if known, skips discovery phase) + +## Workflow Overview + +### Step 2: Field Extraction and Deconstruction +- **Deconstruct each field's meaning**: + * What is being measured? (the entity/concept) + * How is it measured? (collection/calculation method) + * Time dimension? (instantaneous, cumulative, rate of change) + * Business context? (why does this field exist?) + * Generation logic? (reliability considerations) +- **Build field profiles**: Structured understanding of each field's essence + +### Step 3: Reasoning and Analysis +**Performs deep analysis based on collected information:** + +#### A. Field Relationship Mapping +- Analyze logical connections between fields +- Identify: independent fields, related fields, complementary fields +- Map the "story" the dataset tells +- **Key question**: What relationships are implied by these fields? + +#### B. Attention-Driven Mispricing Framework (Internal Process) + +The skill asks itself these questions and generates feature concepts: + +1. **"What grabs investor attention?"** → Attention triggers + - Abnormal trading volume spikes + - Extreme daily return events + - News coverage intensity surges + +2. **"What escapes attention scrutiny?"** → Neglected assets + - Low media coverage stocks + - Complex name or industry classifications + - Non-benchmark index constituents + +3. **"Who faces attention constraints?"** → Investor types + - Retail trading concentration ratios + - Institutional portfolio complexity levels + - Analyst coverage scarcity degrees + +4. **"What creates buying pressure?"** → Demand imbalance + - Unidirectional retail order flow + - Short-sale constraints tightness + - Option market speculation spikes + +5. **"What delays price correction?"** → Arbitrage limits + - High idiosyncratic volatility levels + - Securities borrowing fee spikes + - Market maker inventory capacity + +6. **"When does attention fade?"** → Decay patterns + - Post-event volume normalization speed + - News cycle half-life duration + - Earnings announcement proximity + +7. **"What is relatively ignored?"** → Cross-sectional gaps + - Attention ranking differentials + - Sectoral attention dispersion metrics + - Market cap coverage ratios + +8. 
**"What price distortion remains?"** → Fundamental deviation + - Valuation multiple inflation degree + - Future earnings surprise predictability + - Long-term reversion magnitude potential + +#### C. Feature Concept Generation +For each relevant question-field combination: +- Formulate feature concept that answers the question +- Define the concept clearly +- Identify the logical meaning +- Consider directionality (what high/low values mean) +- Identify boundary conditions +- Note potential issues/limitations + +### Step 4: Feature Documentation +**For each generated feature concept, document:** +- **Concept Name**: Clear, descriptive name +- **Definition**: One-sentence definition +- **Logical Meaning**: What phenomenon/concept does it represent? +- **Why It's Meaningful**: Why does this feature make sense? +- **Directionality**: Interpretation of high vs. low values +- **Boundary Conditions**: What extremes indicate +- **Data Requirements**: What fields are used and any constraints +- **Potential Issues**: Known limitations or concerns + +### Step 5: Output Generation +**Generate structured markdown report including:** + +0. **Output the report markdown format** in the following format: + + # {dataset_name} Feature Engineering Analysis Report + + **Dataset**: {dataset_id} + **Category**: {category} + **Region**: {region} + **Analysis Date**: {analysis_date} + **Fields Analyzed**: {field_count} + + --- + + ## Executive Summary + + **Primary Question Answered by Dataset**: What does this dataset fundamentally measure? + + **Key Insights from Analysis**: + - {insight_1} + - {insight_2} + - {insight_3} + + **Critical Field Relationships Identified**: + - {relationship_1} + - {relationship_2} + + **Most Promising Feature Concepts**: + 1. {top_feature_1} - because {reason_1} + 2. {top_feature_2} - because {reason_2} + 3. {top_feature_3} - because {reason_3} + + --- + + ## Dataset Deep Understanding + + ### Dataset Description + {dataset_description} + + ### Field Inventory + | Field ID | Description | Data Type | Update Frequency | Coverage | + |----------|-------------|-----------|------------------|----------| + | {field_1_id} | {field_1_desc} | {type_1} | {freq_1} | {coverage_1}% | + | {field_2_id} | {field_2_desc} | {type_2} | {freq_2} | {coverage_2}% | + | {field_3_id} | {field_3_desc} | {type_3} | {freq_3} | {coverage_3}% | + + *(Additional fields as needed)* + + ### Field Deconstruction Analysis + + #### {field_1_id}: {field_1_name} + - **What is being measured?**: {measurement_object_1} + - **How is it measured?**: {measurement_method_1} + - **Time dimension**: {time_dimension_1} + - **Business context**: {business_context_1} + - **Generation logic**: {generation_logic_1} + - **Reliability considerations**: {reliability_1} + + #### {field_2_id}: {field_2_name} + - **What is being measured?**: {measurement_object_2} + - **How is it measured?**: {measurement_method_2} + - **Time dimension**: {time_dimension_2} + - **Business context**: {business_context_2} + - **Generation logic**: {generation_logic_2} + - **Reliability considerations**: {reliability_2} + + *(Additional fields as needed)* + + ### Field Relationship Mapping + + **The Story This Data Tells**: + {story_description} + + **Key Relationships Identified**: + 1. {relationship_1_desc} + 2. {relationship_2_desc} + 3. {relationship_3_desc} + + **Missing Pieces That Would Complete the Picture**: + - {missing_1} + - {missing_2} + + --- + + ## Feature Concepts by Question Type + + + ### Q1: "What is stable?" 
(Invariance Features)
+
+   **Concept**: {stability_feature_1_name}
+   - **Sample Fields Used**: {fields_used_1}
+   - **Definition**: {definition_1}
+   - **Why This Feature**: {why_1}
+   - **Logical Meaning**: {logical_meaning_1}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_1}
+   - **Boundary Conditions**: {boundaries_1}
+   - **Implementation Example**: `{implementation_1}`
+
+   **Concept**: {stability_feature_2_name}
+   - **Sample Fields Used**: {fields_used_2}
+   - **Definition**: {definition_2}
+   - **Why This Feature**: {why_2}
+   - **Logical Meaning**: {logical_meaning_2}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_2}
+   - **Boundary Conditions**: {boundaries_2}
+   - **Implementation Example**: `{implementation_2}`
+
+   ---
+
+   ### Q2: "What is changing?" (Dynamics Features)
+
+   **Concept**: {dynamics_feature_1_name}
+   - **Sample Fields Used**: {fields_used_3}
+   - **Definition**: {definition_3}
+   - **Why This Feature**: {why_3}
+   - **Logical Meaning**: {logical_meaning_3}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_3}
+   - **Boundary Conditions**: {boundaries_3}
+   - **Implementation Example**: `{implementation_3}`
+
+   **Concept**: {dynamics_feature_2_name}
+   - **Sample Fields Used**: {fields_used_4}
+   - **Definition**: {definition_4}
+   - **Why This Feature**: {why_4}
+   - **Logical Meaning**: {logical_meaning_4}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_4}
+   - **Boundary Conditions**: {boundaries_4}
+   - **Implementation Example**: `{implementation_4}`
+
+   ---
+
+   ### Q3: "What is anomalous?" (Deviation Features)
+
+   **Concept**: {anomaly_feature_1_name}
+   - **Sample Fields Used**: {fields_used_5}
+   - **Definition**: {definition_5}
+   - **Why This Feature**: {why_5}
+   - **Logical Meaning**: {logical_meaning_5}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_5}
+   - **Boundary Conditions**: {boundaries_5}
+   - **Implementation Example**: `{implementation_5}`
+
+   **Concept**: {anomaly_feature_2_name}
+   - **Sample Fields Used**: {fields_used_6}
+   - **Definition**: {definition_6}
+   - **Why This Feature**: {why_6}
+   - **Logical Meaning**: {logical_meaning_6}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_6}
+   - **Boundary Conditions**: {boundaries_6}
+   - **Implementation Example**: `{implementation_6}`
+
+   ---
+
+   ### Q4: "What is combined?" (Interaction Features)
+
+   **Concept**: {interaction_feature_1_name}
+   - **Sample Fields Used**: {fields_used_7}
+   - **Definition**: {definition_7}
+   - **Why This Feature**: {why_7}
+   - **Logical Meaning**: {logical_meaning_7}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_7}
+   - **Boundary Conditions**: {boundaries_7}
+   - **Implementation Example**: `{implementation_7}`
+
+   **Concept**: {interaction_feature_2_name}
+   - **Sample Fields Used**: {fields_used_8}
+   - **Definition**: {definition_8}
+   - **Why This Feature**: {why_8}
+   - **Logical Meaning**: {logical_meaning_8}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_8}
+   - **Boundary Conditions**: {boundaries_8}
+   - **Implementation Example**: `{implementation_8}`
+
+   ---
+
+   ### Q5: "What is structural?" (Composition Features)
+
+   **Concept**: {structure_feature_1_name}
+   - **Sample Fields Used**: {fields_used_9}
+   - **Definition**: {definition_9}
+   - **Why This Feature**: {why_9}
+   - **Logical Meaning**: {logical_meaning_9}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_9}
+   - **Boundary Conditions**: {boundaries_9}
+   - **Implementation Example**: `{implementation_9}`
+
+   **Concept**: {structure_feature_2_name}
+   - **Sample Fields Used**: {fields_used_10}
+   - **Definition**: {definition_10}
+   - **Why This Feature**: {why_10}
+   - **Logical Meaning**: {logical_meaning_10}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_10}
+   - **Boundary Conditions**: {boundaries_10}
+   - **Implementation Example**: `{implementation_10}`
+
+   ---
+
+   ### Q6: "What is cumulative?" (Accumulation Features)
+
+   **Concept**: {accumulation_feature_1_name}
+   - **Sample Fields Used**: {fields_used_11}
+   - **Definition**: {definition_11}
+   - **Why This Feature**: {why_11}
+   - **Logical Meaning**: {logical_meaning_11}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_11}
+   - **Boundary Conditions**: {boundaries_11}
+   - **Implementation Example**: `{implementation_11}`
+
+   **Concept**: {accumulation_feature_2_name}
+   - **Sample Fields Used**: {fields_used_12}
+   - **Definition**: {definition_12}
+   - **Why This Feature**: {why_12}
+   - **Logical Meaning**: {logical_meaning_12}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_12}
+   - **Boundary Conditions**: {boundaries_12}
+   - **Implementation Example**: `{implementation_12}`
+
+   ---
+
+   ### Q7: "What is relative?" (Comparison Features)
+
+   **Concept**: {relative_feature_1_name}
+   - **Sample Fields Used**: {fields_used_13}
+   - **Definition**: {definition_13}
+   - **Why This Feature**: {why_13}
+   - **Logical Meaning**: {logical_meaning_13}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_13}
+   - **Boundary Conditions**: {boundaries_13}
+   - **Implementation Example**: `{implementation_13}`
+
+   **Concept**: {relative_feature_2_name}
+   - **Sample Fields Used**: {fields_used_14}
+   - **Definition**: {definition_14}
+   - **Why This Feature**: {why_14}
+   - **Logical Meaning**: {logical_meaning_14}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_14}
+   - **Boundary Conditions**: {boundaries_14}
+   - **Implementation Example**: `{implementation_14}`
+
+   ---
+
+   ### Q8: "What is essential?" (Essence Features)
+
+   **Concept**: {essence_feature_1_name}
+   - **Sample Fields Used**: {fields_used_15}
+   - **Definition**: {definition_15}
+   - **Why This Feature**: {why_15}
+   - **Logical Meaning**: {logical_meaning_15}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_15}
+   - **Boundary Conditions**: {boundaries_15}
+   - **Implementation Example**: `{implementation_15}`
+
+   **Concept**: {essence_feature_2_name}
+   - **Sample Fields Used**: {fields_used_16}
+   - **Definition**: {definition_16}
+   - **Why This Feature**: {why_16}
+   - **Logical Meaning**: {logical_meaning_16}
+   - **Is filling nan necessary**: We have operators such as ts_backfill() or group_mean() to fill NaN values. However, a NaN may itself carry meaning, and filling it blindly can introduce bias, so before filling, consider whether the NaN is meaningful in the specific scenario. If filling is appropriate, use a suitable method in the implementation example below.
+   - **Directionality**: {directionality_16}
+   - **Boundary Conditions**: {boundaries_16}
+   - **Implementation Example**: `{implementation_16}`
+
+   ---
+
+   ## Implementation Considerations
+
+   ### Data Quality Notes
+   - **Coverage**: {coverage_note}
+   - **Timeliness**: {timeliness_note}
+   - **Accuracy**: {accuracy_note}
+   - **Potential Biases**: {bias_note}
+
+   ### Computational Complexity
+   - **Lightweight features**: {simple_features}
+   - **Medium complexity**: {medium_features}
+   - **Heavy computation**: {complex_features}
+
+   ### Recommended Prioritization
+
+   **Tier 1 (Immediate Implementation)**:
+   1. {priority_1_feature} - {priority_1_reason}
+   2. {priority_2_feature} - {priority_2_reason}
+   3. {priority_3_feature} - {priority_3_reason}
+
+   **Tier 2 (Secondary Priority)**:
+   1. {priority_4_feature} - {priority_4_reason}
+   2. {priority_5_feature} - {priority_5_reason}
+
+   **Tier 3 (Requires Further Validation)**:
+   1. {priority_6_feature} - {priority_6_reason}
+
+   ---
+
+   ## Critical Questions for Further Exploration
+
+   ### Unanswered Questions:
+   1. {unanswered_question_1}
+   2. {unanswered_question_2}
+   3. {unanswered_question_3}
+
+   ### Recommended Additional Data:
+   - {additional_data_1}
+   - {additional_data_2}
+   - {additional_data_3}
+
+   ### Assumptions to Challenge:
+   - {assumption_1}
+   - {assumption_2}
+   - {assumption_3}
+
+   ---
+
+   ## Methodology Notes
+
+   **Analysis Approach**: This report was generated by:
+   1. Deep field deconstruction to understand data essence
+   2. Question-driven feature generation (8 fundamental questions)
+   3. Logical validation of each feature concept
+   4. Transparent documentation of reasoning
+
+   **Design Principles**:
+   - Focus on logical meaning over conventional patterns
+   - Every feature must answer a specific question
+   - Clear documentation of "why" for each suggestion
+   - Emphasis on data understanding over prediction
+
+   ---
+
+   *Report generated: {generation_timestamp}*
+   *Analysis depth: Comprehensive field deconstruction + 8-question framework*
+   *Next steps: Implement Tier 1 features, validate assumptions, gather additional data as needed*
+
+
+
+## Core Analysis Principles
+
+1. **From Data Essence**: Start with what data truly means, not what it's traditionally used for
+2. **Autonomous Reasoning**: Skill performs all thinking, no user input required
+3. **Question-Driven**: Internal question bank guides feature generation
+4. **Meaning Over Patterns**: Prioritize logical meaning over conventional combinations
+5. **Transparency**: Show reasoning process in output
+
+## Example Output Structure
+
+When analyzing dataset 'BEME' (Balance Sheet and Market Data), the output would include:
+
+### Dataset Understanding
+**Fields Analyzed**: book_value, market_cap, book_to_market, etc.
+**Key Observations**: Dataset compares accounting values with market valuations
+
+### Field Deconstruction
+- **book_value**: Accountant's calculation of net asset value (quarterly, audited, historical cost-based)
+- **market_cap**: Market participants' valuation (continuous, forward-looking, sentiment-influenced)
+- **book_to_market**: Ratio comparing these two valuation perspectives
+
+### Feature Concepts Generated
+
+**From "What is stable?"**
+- "Market reevaluation stability": Rolling coefficient of variation of book_to_market
+- **Logic**: Measures whether market opinion is stable or volatile
+- **Meaning**: Stable values suggest consensus, volatile values suggest disagreement/uncertainty
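+
+A minimal sketch of how this first concept might be written as an expression template (illustrative only; the 252-day window is an arbitrary assumption, not part of the skill spec):
+
+```python
+# Hypothetical implementation example for "Market reevaluation stability":
+# rolling coefficient of variation = rolling std dev / |rolling mean|.
+template = "ts_std_dev({book_to_market}, 252) / abs(ts_mean({book_to_market}, 252))"
+```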
+
+**From "What is changing?"**
+- "Value creation vs. market reevaluation decomposition": Separate book_value growth from market_cap growth
+- **Logic**: Distinguish fundamental value creation from market sentiment changes
+- **Meaning**: Which component drives changes in book_to_market?
+
+**From "What is combined?"**
+- "Intangible value proportion": (market_cap - book_value) / enterprise_value
+- **Logic**: Quantify proportion of value from intangibles (brand, growth, etc.)
+- **Meaning**: What percentage of valuation isn't captured on the balance sheet?
+
+**(Additional question-based features would follow...)**
+
+## Implementation Notes
+
+### The skill should:
+1. **Analyze first, then generate**: Fully understand dataset before proposing features
+2. **Show reasoning**: Explain why each feature concept makes sense
+3. **Be specific**: Reference actual field names and their characteristics
+4. **Be critical**: Question assumptions and identify limitations
+5. **Be creative**: Look beyond traditional financial metrics
+
+### The skill should NOT:
+1. **Ask users to think**: All thinking is internal to the skill
+2. **Provide generic templates**: Each analysis should be specific to the dataset
+3. **Rely on conventional wisdom**: Challenge traditional approaches
+4. **Output patterns without meaning**: Every suggestion must have clear logic
+
+## Quality Assurance
+
+**Self-Check Process:**
+- [ ] All fields analyzed, not just skimmed
+- [ ] Field meanings understood beyond descriptions
+- [ ] Multiple question types explored
+- [ ] Each feature has clear logical meaning
+- [ ] Reasoning is explicit, not implicit
+- [ ] Limitations are acknowledged
+- [ ] Output is dataset-specific, not generic
+
+**Validation Questions:**
+- Would this analysis help someone truly understand the data?
+- Are feature concepts novel yet meaningful?
+- Is the reasoning process transparent?
+- Does it avoid conventional thinking traps?
+
+---
+
+*This skill performs deep analysis of BRAIN datasets, generating meaningful feature engineering concepts based on data essence and logical reasoning.*
+
+--- SKILL.md (brain-feature-implementation) ---
+---
+name: brain-feature-implementation
+description: Automate conversion of Brain idea documents into actionable Alpha expressions using local CSV data.
+---
+
+# Brain Feature Implementation
+
+## Description
+This skill automates the process of converting a WorldQuant Brain idea document (Markdown) into actionable Alpha expressions.
+
+## Instructions
+
+1. **Analyze the Idea Document**
+    * Read the provided markdown file.
+    * Extract the following metadata:
+        * **Dataset ID** (e.g., `analyst15`)
+        * **Region** (e.g., `GLB`)
+        * **Delay** (e.g., `1` or `0`)
+    * *If any metadata is missing, ask the user to clarify.*
+
+2. **Plan Implementation**
+    * Scan the markdown file for **Feature Definitions** or **Formulas**.
+    * Look for patterns like `Definition: ` or code blocks describing math.
+    * Use the `manage_todo_list` tool to create a plan with one entry for each unique idea/formula found.
+        * *Title*: The Idea Name or ID (e.g., "3.1.1 Estimate Stability Score").
+        * *Description*: The specific template formula (e.g., `template: "{st_dev} / abs({mean})"`).
+
+3. **Execute Implementation**
+    * For each item in the Todo List:
+        * **Construct the Template**:
+            * Use Python format string syntax `{variable}`.
+            * The `{variable}` must be the **exact suffix** of the fields in the dataset as listed in the fields input.
+            * **CRITICAL**: Do NOT include the dataset prefix (e.g., `anl14_`) or horizon in the template. The script auto-detects these (see the sketch after this list).
+        * **Time Window Handling**: For datasets with multiple time horizons (e.g., `_fy1`, `_fy2`, `_fp1`, `_fp2`), you MUST specify the time window in the variable. Use the full suffix as it appears in the field ID after removing the dataset prefix.
+            * *Correct Example*: For field `anl14_mean_roe_fy1`, use template: `{mean_roe_fy1}`.
+            * *Incorrect Example*: `{mean_roe}` (missing time window), `{anl14_mean_roe_fy1}` (includes prefix).
+        * *Note*: The script ONLY accepts `--template` and `--dataset`. Do not pass any other arguments like `--filters` or `--groupby`.
+        * Verify the output (number of expressions generated).
+        * Mark the Todo item as completed.
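+
+A minimal sketch of that suffix-to-field expansion, assuming a hypothetical `expand_template` helper (the real implement_idea.py is not shown in this patch, so the name and behavior here are assumptions):
+
+```python
+import re
+
+def expand_template(template: str, field_ids: list[str]) -> str:
+    """Replace each {suffix} placeholder with the unique full field ID
+    that ends with that suffix (dataset prefix auto-detected)."""
+    def resolve(match: re.Match) -> str:
+        suffix = match.group(1)
+        hits = [f for f in field_ids if f == suffix or f.endswith("_" + suffix)]
+        if len(hits) != 1:
+            raise ValueError(f"suffix {suffix!r} is missing or ambiguous")
+        return hits[0]
+    return re.sub(r"\{(\w+)\}", resolve, template)
+
+# expand_template("{st_dev} / abs({mean})", ["anl14_st_dev", "anl14_mean"])
+# returns "anl14_st_dev / abs(anl14_mean)"
+```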
+
+------
+"allowed_operators": [
+  {
+    "name": "add",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Adds two or more inputs element wise. Set filter=true to treat NaNs as 0 before summing.",
+    "definition": "add(x, y, filter = false), x + y"
+  },
+  {
+    "name": "abs",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Returns the absolute value of a number, removing any negative sign.",
+    "definition": "abs(x)"
+  },
+  {
+    "name": "log",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Calculates the natural logarithm of the input value. Commonly used to transform data that has positive values.",
+    "definition": "log(x)"
+  },
+  {
+    "name": "subtract",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Subtracts inputs left to right: x - y - ... Supports two or more inputs. Set filter=true to treat NaNs as 0 before subtraction.",
+    "definition": "subtract(x, y, filter=false), x - y"
+  },
+  {
+    "name": "signed_power",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "x raised to the power of y such that final result preserves sign of x",
+    "definition": "signed_power(x, y)"
+  },
+  {
+    "name": "sign",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Returns the sign of a number: +1 for positive, -1 for negative, and 0 for zero. If the input is NaN, returns NaN.\r\n\r\nInput: Value of 7 instruments at day t: (2, -3, 5, 6, 3, NaN, -10)\r\nOutput: (1, -1, 1, 1, 1, NaN, -1)",
+    "definition": "sign(x)"
+  },
+  {
+    "name": "reverse",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": " - x",
+    "definition": "reverse(x)"
+  },
+  {
+    "name": "power",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "x ^ y",
+    "definition": "power(x, y)"
+  },
+  {
+    "name": "multiply",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Multiplies two or more inputs element wise. Set filter=true to treat NaNs as 0 before multiplication",
+    "definition": "multiply(x, y, ..., filter=false), x * y"
+  },
+  {
+    "name": "min",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Minimum value of all inputs. At least 2 inputs are required",
+    "definition": "min(x, y, ..)"
+  },
+  {
+    "name": "max",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Maximum value of all inputs. At least 2 inputs are required",
+    "definition": "max(x, y, ..)"
+  },
+  {
+    "name": "inverse",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "1 / x",
+    "definition": "inverse(x)"
+  },
+  {
+    "name": "sqrt",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Returns the non-negative square root of x. Equivalent to power(x, 0.5); for signed roots use signed_power(x, 0.5).",
+    "definition": "sqrt(x)"
+  },
+  {
+    "name": "s_log_1p",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Confines the input to a shorter range using a logarithm, so that a higher input remains higher and a negative input remains negative, with -1 and 1 as asymptotic values of the resulting function",
+    "definition": "s_log_1p(x)"
+  },
+  {
+    "name": "densify",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "Converts a grouping field with many buckets into a smaller number of only the available buckets, making grouping fields computationally efficient to work with",
+    "definition": "densify(x)"
+  },
+  {
+    "name": "divide",
+    "category": "Arithmetic",
+    "scope": "['REGULAR']",
+    "description": "x / y",
+    "definition": "divide(x, y), x / y"
+  },
+  {
+    "name": "not",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "Returns the logical negation of x. Returns 0 when x is 1 ('true') and 1 when x is 0 ('false').",
+    "definition": "not(x)"
+  },
+  {
+    "name": "and",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "Returns 1 ('true') if both inputs are 1 ('true'). Otherwise, returns 0 ('false').",
+    "definition": "and(input1, input2)"
+  },
+  {
+    "name": "less",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "Returns 1 ('true') if input1 is smaller than input2. Otherwise, returns 0 ('false').",
+    "definition": "input1 < input2"
+  },
+  {
+    "name": "equal",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "Returns 1 ('true') if input1 and input2 are the same. Otherwise, returns 0 ('false').",
+    "definition": "input1 == input2"
+  },
+  {
+    "name": "or",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "Returns 1 if either input is true (either input1 or input2 has a value of 1), otherwise it returns 0.",
+    "definition": "or(input1, input2)"
+  },
+  {
+    "name": "not_equal",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "Returns 1 ('true') if input1 and input2 are different numbers. Otherwise, returns 0 ('false').",
+    "definition": "input1 != input2"
+  },
+  {
+    "name": "greater",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "Returns 1 ('true') if input1 is larger than input2. Otherwise, returns 0 ('false').",
+    "definition": "input1 > input2"
+  },
+  {
+    "name": "greater_equal",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "Returns 1 ('true') if input1 is larger than or the same as input2. Otherwise, returns 0 ('false').",
+    "definition": "input1 >= input2"
+  },
+  {
+    "name": "less_equal",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "Returns 1 ('true') if input1 is smaller than or the same as input2. Otherwise, returns 0 ('false').",
+    "definition": "input1 <= input2"
+  },
+  {
+    "name": "is_nan",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "If (input == NaN) return 1 else return 0",
+    "definition": "is_nan(input)"
+  },
+  {
+    "name": "if_else",
+    "category": "Logical",
+    "scope": "['REGULAR']",
+    "description": "The if_else operator returns one of two values based on a condition. If the condition is true, it returns the first value; if false, it returns the second value.",
+    "definition": "if_else(input1, input2, input3)"
+  },
+  {
+    "name": "ts_sum",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Sum values of x for the past d days.",
+    "definition": "ts_sum(x, d)"
+  },
+  {
+    "name": "ts_scale",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Scales a time series to a 0–1 range based on its minimum and maximum values over a specified period, with an optional constant shift.",
+    "definition": "ts_scale(x, d, constant = 0)"
+  },
+  {
+    "name": "ts_mean",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Calculates the simple average (mean) value of a variable x over the past d days.",
+    "definition": "ts_mean(x, d)"
+  },
+  {
+    "name": "ts_zscore",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Calculates the Z-score of a time series, showing how far today's value is from the recent average, measured in standard deviations. Useful for standardizing and comparing values over time.",
+    "definition": "ts_zscore(x, d)"
+  },
+  {
+    "name": "ts_std_dev",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Calculates the standard deviation of a data series x over the past d days, measuring how much the values deviate from their mean during that period.",
+    "definition": "ts_std_dev(x, d)"
+  },
+  {
+    "name": "kth_element",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Returns the K-th value from a time series by looking back over a specified number of ('d') days, with the option to ignore certain values. Commonly used for backfilling missing data.",
+    "definition": "kth_element(x, d, k, ignore=\"NaN\")"
+  },
+  {
+    "name": "inst_tvr",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Total trading value / Total holding value in the past d days\r\n\r\nInput: Value of 1 instrument in past 5 days where first element is the latest: (105, 102, 99, 101,100)\r\nOutput: 0.022 from (1+2+3+3)/(105+102+99+101)",
+    "definition": "inst_tvr(x, d)"
+  },
+  {
+    "name": "ts_corr",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Calculates the Pearson correlation between two variables, x and y, over the past d days, showing how closely they move together.",
+    "definition": "ts_corr(x, y, d)"
+  },
+  {
+    "name": "ts_count_nans",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Counts the number of missing (NaN) values in a data series over a specified number of days.",
+    "definition": "ts_count_nans(x, d)"
+  },
+  {
+    "name": "ts_target_tvr_decay",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Tune \"ts_decay\" to have a turnover equal to a certain target, with optimization weight range between lambda_min, lambda_max",
+    "definition": "ts_target_tvr_decay(x, lambda_min=0, lambda_max=1, target_tvr=0.1)"
+  },
+  {
+    "name": "ts_median",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Returns median value of x for the past d days",
+    "definition": "ts_median(x, d)"
+  },
+  {
+    "name": "ts_covariance",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Calculates the covariance between two time-series variables, y and x, over the past d days. 
Useful for measuring how two variables move together within a specified historical window.", + "definition": "ts_covariance(y, x, d)" + }, + { + "name": "ts_decay_linear", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Applies a linear decay to time-series data over a set number of days, smoothing the data by averaging recent values and reducing the impact of older or missing data.", + "definition": "ts_decay_linear(x, d, dense = false)" + }, + { + "name": "ts_product", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Returns the product of the values of x over the past d days. Useful for calculating geometric means and compounding returns or growth rates.", + "definition": "ts_product(x, d)" + }, + { + "name": "ts_regression", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Returns various parameters related to regression function", + "definition": "ts_regression(y, x, d, lag = 0, rettype = 0)" + }, + { + "name": "ts_delta_limit", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Limit the change in the Alpha position x between dates to a specified fraction of y. The \"limit_volume\" can be in the range of 0 to 1. Also, please be aware of the scaling for x and y. Besides setting y as adv20 or volume related data, you can also set y as a constant.", + "definition": "ts_delta_limit(x, y, limit_volume=0.1)" + }, + { + "name": "ts_step", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Returns a counter of days, incrementing by one each day.", + "definition": "ts_step(1)" + }, + { + "name": "ts_decay_exp_window", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Returns exponential decay of x with smoothing factor for the past d days", + "definition": "ts_decay_exp_window(x, d, factor = f)" + }, + { + "name": "ts_quantile", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Calculates the ts_rank of the input and transforms it using the inverse cumulative distribution function (quantile function) of a specified probability distribution (default: Gaussian/normal). This helps to normalize or reshape the distribution of your data over a rolling window.", + "definition": "ts_quantile(x,d, driver=\"gaussian\" )" + }, + { + "name": "days_from_last_change", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Calculates the number of days since the last change in the value of a given variable.", + "definition": "days_from_last_change(x)" + }, + { + "name": "hump", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Limits amount and magnitude of changes in input (thus reducing turnover)", + "definition": "hump(x, hump = 0.01)" + }, + { + "name": "last_diff_value", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Returns the most recent value of x from the past d days that is different from the current value of x.", + "definition": "last_diff_value(x, d)" + }, + { + "name": "ts_arg_max", + "category": "Time Series", + "scope": "['REGULAR']", + "description": "Returns the number of days since the maximum value occurred in the last d days of a time series. 
If today's value is the maximum, returns 0; if it was yesterday, returns 1, and so on.",
+    "definition": "ts_arg_max(x, d)"
+  },
+  {
+    "name": "ts_arg_min",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Returns the number of days since the minimum value occurred in a time series over the past d days. If today's value is the minimum, returns 0; if it was yesterday, returns 1, and so on.",
+    "definition": "ts_arg_min(x, d)"
+  },
+  {
+    "name": "ts_av_diff",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Calculates the difference between a value and its mean over a specified period, ignoring NaN values in the mean calculation. In short, it returns x - ts_mean(x, d) with NaNs ignored.",
+    "definition": "ts_av_diff(x, d)"
+  },
+  {
+    "name": "ts_backfill",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Replaces missing (NaN) values in a time series with the most recent valid value from a specified lookback window, improving data coverage and reducing risk from missing data.",
+    "definition": "ts_backfill(x, lookback = d, k=1)"
+  },
+  {
+    "name": "ts_rank",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Ranks the value of a variable for each instrument over a specified number of past days, returning the rank of the current value (optionally adjusted by a constant). Useful for normalizing time-series data and highlighting relative performance over time.",
+    "definition": "ts_rank(x, d, constant = 0)"
+  },
+  {
+    "name": "ts_delay",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Returns the value of a variable x from d days ago. Use this operator to access historical data points by specifying the desired time lag in days.",
+    "definition": "ts_delay(x, d)"
+  },
+  {
+    "name": "ts_delta",
+    "category": "Time Series",
+    "scope": "['REGULAR']",
+    "description": "Calculates the difference between a value and its delayed version over a specified period. Useful for measuring changes or momentum in time-series data.",
+    "definition": "ts_delta(x, d)"
+  },
+  {
+    "name": "winsorize",
+    "category": "Cross Sectional",
+    "scope": "['REGULAR']",
+    "description": "Winsorizes x to make sure that all values in x are between the lower and upper limits, which are specified as a multiple of std.\r\n\r\nInput: Value of 7 instruments at day t: (2, 4, 5, 6, 3, 8, 10), std: 1\r\nOutput: (2.81, 4, 5, 6, 3, 8, 8.03) from std = 2.61, mean = 5.42",
+    "definition": "winsorize(x, std=4)"
+  },
+  {
+    "name": "truncate",
+    "category": "Cross Sectional",
+    "scope": "['REGULAR']",
+    "description": "Operator truncates all values of x to maxPercent. Here, maxPercent is in decimal notation",
+    "definition": "truncate(x, maxPercent=0.01)"
+  },
+  {
+    "name": "regression_neut",
+    "category": "Cross Sectional",
+    "scope": "['REGULAR']",
+    "description": "Conducts the cross-sectional regression on the stocks with Y as target and X as the independent variable",
+    "definition": "regression_neut(y, x)"
+  },
+  {
+    "name": "scale",
+    "category": "Cross Sectional",
+    "scope": "['REGULAR']",
+    "description": "Scales input to booksize. 
We can also scale the long positions and short positions to separate scales by mentioning additional parameters to the operator", + "definition": "scale(x, scale=1, longscale=1, shortscale=1)" + }, + { + "name": "rank", + "category": "Cross Sectional", + "scope": "['REGULAR']", + "description": "Ranks the input among all the instruments and returns an equally distributed number between 0.0 and 1.0. For precise sort, use the rate as 0", + "definition": "rank(x, rate=2)" + }, + { + "name": "quantile", + "category": "Cross Sectional", + "scope": "['REGULAR']", + "description": "Rank the raw vector, shift the ranked Alpha vector, apply distribution (gaussian, cauchy, uniform). If driver is uniform, it simply subtract each Alpha value with the mean of all Alpha values in the Alpha vector", + "definition": "quantile(x, driver = gaussian, sigma = 1.0)" + }, + { + "name": "normalize", + "category": "Cross Sectional", + "scope": "['REGULAR']", + "description": "Calculates the mean value of all valid alpha values for a certain date, then subtracts that mean from each element", + "definition": "normalize(x, useStd = false, limit = 0.0)" + }, + { + "name": "zscore", + "category": "Cross Sectional", + "scope": "['REGULAR']", + "description": "Z-score is a numerical measurement that describes a value's relationship to the mean of a group of values. Z-score is measured in terms of standard deviations from the mean", + "definition": "zscore(x)" + }, + { + "name": "vec_min", + "category": "Vector", + "scope": "['REGULAR']", + "description": "Minimum value form vector field x", + "definition": "vec_min(x)" + }, + { + "name": "vec_count", + "category": "Vector", + "scope": "['REGULAR']", + "description": "Number of elements in vector field x", + "definition": "vec_count(x)" + }, + { + "name": "vec_stddev", + "category": "Vector", + "scope": "['REGULAR']", + "description": "Standard Deviation of vector field x", + "definition": "vec_stddev(x)" + }, + { + "name": "vec_range", + "category": "Vector", + "scope": "['REGULAR']", + "description": "Difference between maximum and minimum element in vector field x", + "definition": "vec_range(x)" + }, + { + "name": "vec_avg", + "category": "Vector", + "scope": "['REGULAR']", + "description": "Taking mean of the vector field x\r\n\r\nInput: Vector of value of 1 instrument in a day: (2, 3, 5, 6, 3, 8, 10)\r\nOutput: 37 / 7 = 5.29", + "definition": "vec_avg(x)" + }, + { + "name": "vec_sum", + "category": "Vector", + "scope": "['REGULAR']", + "description": "Sum of vector field x\r\n\r\nInput: Vector of value of 1 instrument in a day: (2, 3, 5, 6, 3, 8, 10)\r\nOutput: 2 + 3 + 5 + 6 + 3 + 8 + 10 = 37", + "definition": "vec_sum(x)" + }, + { + "name": "vec_max", + "category": "Vector", + "scope": "['REGULAR']", + "description": "Maximum value form vector field x", + "definition": "vec_max(x)" + }, + { + "name": "left_tail", + "category": "Transformational", + "scope": "['REGULAR']", + "description": "NaN everything greater than maximum, maximum should be constant", + "definition": "left_tail(x, maximum = 0)" + }, + { + "name": "trade_when", + "category": "Transformational", + "scope": "['REGULAR']", + "description": "Used in order to change Alpha values only under a specified condition and to hold Alpha values in other cases. 
It also allows to close Alpha positions (assign NaN values) under a specified condition", + "definition": "trade_when(x, y, z)" + }, + { + "name": "right_tail", + "category": "Transformational", + "scope": "['REGULAR']", + "description": "NaN everything less than minimum, minimum should be constant", + "definition": "right_tail(x, minimum = 0)" + }, + { + "name": "bucket", + "category": "Transformational", + "scope": "['REGULAR']", + "description": "Convert float values into indexes for user-specified buckets. Bucket is useful for creating group values, which can be passed to GROUP as input", + "definition": "bucket(rank(x), range=\"0, 1, 0.1\" or buckets = \"2,5,6,7,10\")" + }, + { + "name": "group_rank", + "category": "Group", + "scope": "['REGULAR']", + "description": "Each elements in a group is assigned the corresponding rank in this group", + "definition": "group_rank(x, group)" + }, + { + "name": "group_cartesian_product", + "category": "Group", + "scope": "['REGULAR']", + "description": "Merge two groups into one group. If originally there are len_1 and len_2 group indices in g1 and g2, there will be len_1 * len_2 indices in the new group.", + "definition": "group_cartesian_product(g1, g2)" + }, + { + "name": "group_backfill", + "category": "Group", + "scope": "['REGULAR']", + "description": "If a certain value for a certain date and instrument is NaN, from the set of same group instruments, calculate winsorized mean of all non-NaN values over last d days", + "definition": "group_backfill(x, group, d, std = 4.0)" + }, + { + "name": "group_mean", + "category": "Group", + "scope": "['REGULAR']", + "description": "All elements in group equals to the mean", + "definition": "group_mean(x, weight, group)" + }, + { + "name": "group_neutralize", + "category": "Group", + "scope": "['REGULAR']", + "description": "Neutralizes Alpha against groups. These groups can be subindustry, industry, sector, country or a constant", + "definition": "group_neutralize(x, group)" + }, + { + "name": "group_normalize", + "category": "Group", + "scope": "['REGULAR']", + "description": "Normalizes input such that each group's absolute sum is 1", + "definition": "group_normalize(x, group, constantCheck=False, tolerance=0.01, scale=1)" + }, + { + "name": "group_median", + "category": "Group", + "scope": "['REGULAR']", + "description": "All elements in group equals to the median value of the group.", + "definition": "group_median(x, group)" + }, + { + "name": "group_scale", + "category": "Group", + "scope": "['REGULAR']", + "description": "Normalizes the values in a group to be between 0 and 1. (x - groupmin) / (groupmax - groupmin)", + "definition": "group_scale(x, group)" + }, + { + "name": "group_zscore", + "category": "Group", + "scope": "['REGULAR']", + "description": "Calculates group Z-score - numerical measurement that describes a value's relationship to the mean of a group of values. Z-score is measured in terms of standard deviations from the mean. zscore = (data - mean) / stddev of x for each instrument within its group.\r\n\r\nInput: Value of 5 instruments of Group A: (100, 0, 50, 60, 25)\r\nOutput: (1.57, -1.39, 0.09, 0.39, -0.65)", + "definition": "group_zscore(x, group)" + } +] + +CRITICAL OUTPUT RULES (to ensure implement_idea.py can generate expressions): +- Every Implementation Example MUST be a Python format template using `{variable}`. +- Every `{variable}` MUST be constructed from the actual field suffixes provided in the fields list. Do NOT invent variable names. 
+- The suffix must match exactly how it appears in the field ID after removing the dataset prefix (e.g., for `anl14_mean_roe_fy1`, use `{mean_roe_fy1}`, not `{mean_roe}` or `{roe}`).
+- When you implement ideas, ONLY use operators from allowed_operators provided.
+- Do NOT include dataset codes/prefixes/horizons in `{variable}` beyond the suffix itself.
+- If you show raw field ids in tables, use backticks `` `like_this` ``, NOT `{braces}`.
+- Include these metadata lines verbatim somewhere near the top:
+  **Dataset**:
+  **Region**:
+  **Delay**:
+
+---
+## EVENT FIELD IDENTIFICATION (CRITICAL FOR ts_* OPERATORS)
+
+**Event fields are NOT continuous daily data. They only have values on specific dates (earnings announcements, analyst revisions, etc.) and are NaN on other days.**
+
+### Quick Field Type Classification Method
+
+**Step 1: Check Dataset Prefix**
+- `anl*` (analyst data), `fnd*` (fundamental data), `ern*` (earnings data) → **Likely EVENT fields**
+- `mdl*` (model data), `pv*` (provider data), `nws*` (news data) → **Likely EVENT fields**
+- `oth*` (other data) → **Requires further analysis (see Step 2)**
+
+**Step 2: Analyze Field Description Keywords**
+- **CONTINUOUS fields indicators**: "predicted", "confidence", "score", "daily", "continuous", "return", "probability", "label"
+- **EVENT fields indicators**: "estimate", "guidance", "revision", "announcement", "quarterly", "fiscal", "surprise", "consensus", "actual"
+
+**Step 3: Check Time Window Suffixes**
+- `_fy1`, `_fy2`, `_fp1`, `_fp2`, `_qtr`, `_ttm` → **EVENT fields** (fiscal year/period markers)
+- `_d`, `_ret`, `_prob`, `_label`, `_score` → **CONTINUOUS fields** (daily values)
+
+### Detailed Event Field Identification Rules
+
+**How to identify event fields from the fields list:**
+1. Field description contains words like: "surprise", "announcement", "revision", "event", "post", "pre", "consensus", "actual", "fiscal", "quarterly" (when it's a point-in-time value)
+2. Field name contains patterns like: `_surprise`, `_event`, `_revision`, `_consensus`, `_actual`, `_pre`, `_post`, `_announcement`, `_date`, `_flag`
+3. Fields representing: earnings surprises, analyst revisions, consensus estimates before/after events, recommendation changes, special items, one-time adjustments
+
+**Examples from typical datasets:**
+- `presurprise`, `actsurprise` → event data (surprise only on earnings date)
+- `aftercons_mean`, `beforecons_mean` → event data (snapshots around earnings)
+- `estsup`, `estsdown` → event data (revision counts, not daily values)
+- `xoptq`, `pncq`, `spceq` → event data (quarterly updates, not daily)
+- **NEW**: `oth566_return`, `oth566_prob_*`, `oth566_label_*` → **CONTINUOUS data** (ML predictions with daily values)
+
+### CRITICAL RULES
+
+**Rule 1: ts_* Operators Restriction**
+- `ts_*` operators (ts_mean, ts_std_dev, ts_zscore, ts_delta, ts_sum, ts_rank, etc.) can ONLY be used with continuous daily fields
+- **DO NOT** use `ts_*` operators on event fields
+- **Exception**: `ts_delay(event_field, days)` is allowed to access historical event values
+
+**Rule 2: Arithmetic Operators on Event Fields**
+- `add`, `subtract`, `multiply`, `divide` → **SAFE** for event fields in cross-sectional calculations (same-day operations)
+- Example: `divide(anl16_meanest, anl16_eststddev)` is valid (z-score calculation on same day)
+
+**Rule 3: Pattern-Based Classification**
+Based on historical error analysis, fields matching these patterns are EVENT fields (see the sketch after this section):
+```
+Dataset prefixes: anl*, fnd*, ern*, mdl*, pv*, nws*
+Field patterns: *_estimate_*, *_guidance_*, *_revision_*, *_event*
+Time markers: *_qtr, *_fy1, *_fy2, *_fp1, *_fp2, *_ttm
+```
+
+**Safe alternatives for event fields:**
+- Use event fields directly in ratios or cross-sectional comparisons
+- Use `ts_delay(event_field, days)` to capture the last known event value
+- Use event fields with `group_*` operators for cross-sectional analysis
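+
+A minimal sketch of Rule 3 as code, purely illustrative (classification in this workflow is done by the model, and the helper name is an assumption):
+
+```python
+# Pattern lists copied from Rule 3 above.
+EVENT_PREFIXES = ("anl", "fnd", "ern", "mdl", "pv", "nws")
+EVENT_MARKERS = ("_estimate_", "_guidance_", "_revision_", "_event")
+EVENT_SUFFIXES = ("_qtr", "_fy1", "_fy2", "_fp1", "_fp2", "_ttm")
+
+def is_likely_event_field(field_id: str) -> bool:
+    """Return True if a field ID matches any event-field pattern."""
+    return (field_id.startswith(EVENT_PREFIXES)
+            or any(marker in field_id for marker in EVENT_MARKERS)
+            or field_id.endswith(EVENT_SUFFIXES))
+
+# is_likely_event_field("anl14_mean_roe_fy1") -> True  (event: avoid ts_* except ts_delay)
+# is_likely_event_field("oth566_return")      -> False (continuous daily data)
+```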
+---
+
+# User Prompt
+
+{
+  "instructions": {
+    "output_format": "Fill OUTPUT_TEMPLATE.md with concrete content.",
+    "implementation_examples": "Each Implementation Example must be a template with {variable} placeholders. Use only suffixes derived from the provided fields list. Always include time window suffixes (e.g., _fy1, _fp1) when present in the fields.",
+    "no_code_fences": true,
+    "do_not_invent_placeholders": true
+  },
+  "dataset_context": {
+    "dataset_id": "biasfree_analyst",
+    "dataset_name": null,
+    "dataset_description": null,
+    "category": "Analyst",
+    "region": "USA",
+    "delay": 1,
+    "universe": "TOP200",
+    "field_count": 54
+  },
+  "fields": [
+    {
+      "id": "third_biasfree_price_target_analogue",
+      "description": "The third bias-free analogue value for a price target forecast from an analyst."
+    },
+    {
+      "id": "stddev_third_biasfree_price_target_estimate",
+      "description": "The standard deviation of the third bias-free price target estimate for the period."
+    },
+    {
+      "id": "stddev_second_biasfree_quarterly_fundamental",
+      "description": "The standard deviation of the second bias-free quarterly fundamental estimate for the period."
+    },
+    {
+      "id": "stddev_second_biasfree_price_target_estimate",
+      "description": "The standard deviation of the second bias-free price target estimate for the period."
+    },
+    {
+      "id": "stddev_second_biasfree_fundamental_estimate",
+      "description": "The standard deviation of the second bias-free fundamental estimate for the period."
+    },
+    {
+      "id": "stddev_first_biasfree_quarterly_fundamental",
+      "description": "The standard deviation of the first bias-free quarterly fundamental estimate for the period."
+    },
+    {
+      "id": "stddev_first_biasfree_price_target_estimate",
+      "description": "The standard deviation of the first bias-free price target estimate for the period."
+    },
+    {
+      "id": "stddev_first_biasfree_fundamental_estimate",
+      "description": "The standard deviation of the first bias-free fundamental estimate for the period."
+    },
+    {
+      "id": "stddev_biasfree_quarterly_fundamental_estimate",
+      "description": "The standard deviation of bias-adjusted quarterly fundamental estimates for the period."
+    },
+    {
+      "id": "stddev_bias_adjusted_price_target",
+      "description": "The standard deviation of bias-adjusted price target estimates for the period."
+    },
+    {
+      "id": "stddev_bias_adjusted_fundamental_estimate",
+      "description": "The standard deviation of bias-adjusted fundamental estimates for the period."
+ }, + { + "id": "second_biasfree_price_target_analogue", + "description": "The second bias-free analogue value for a price target forecast from an analyst." + }, + { + "id": "second_biasfree_fundamental_analogue", + "description": "The second bias-free analogue value for a fundamental forecast from an analyst." + }, + { + "id": "num_upward_biasfree_quarterly_fundamental_revisions", + "description": "The number of times analysts have raised their bias-adjusted quarterly fundamental estimates." + }, + { + "id": "num_upward_biasfree_price_target_revisions", + "description": "The number of times analysts have raised their bias-adjusted price target estimates." + }, + { + "id": "num_upward_biasfree_fundamental_revisions", + "description": "The number of times analysts have raised their bias-adjusted fundamental estimates." + }, + { + "id": "num_downward_biasfree_quarterly_fundamental_revisions", + "description": "The number of times analysts have lowered their bias-adjusted quarterly fundamental estimates." + }, + { + "id": "num_downward_biasfree_price_target_revisions", + "description": "The number of times analysts have lowered their bias-adjusted price target estimates." + }, + { + "id": "num_downward_biasfree_fundamental_revisions", + "description": "The number of times analysts have lowered their bias-adjusted fundamental estimates." + }, + { + "id": "min_biasfree_quarterly_fundamental_estimate", + "description": "The lowest value among bias-adjusted quarterly fundamental estimates for the period." + }, + { + "id": "min_bias_adjusted_price_target", + "description": "The lowest value among bias-adjusted price target estimates for the period." + }, + { + "id": "min_bias_adjusted_fundamental_estimate", + "description": "The lowest value among bias-adjusted fundamental estimates for the period." + }, + { + "id": "median_third_biasfree_price_target_estimate", + "description": "The median value of the third bias-free price target estimate for the period." + }, + { + "id": "median_second_biasfree_quarterly_fundamental", + "description": "The median value of the second bias-free quarterly fundamental estimate for the period." + }, + { + "id": "median_second_biasfree_price_target_estimate", + "description": "The median value of the second bias-free price target estimate for the period." + }, + { + "id": "median_second_biasfree_fundamental_estimate", + "description": "The median value of the second bias-free fundamental estimate for the period." + }, + { + "id": "median_first_biasfree_quarterly_fundamental", + "description": "The median value of the first bias-free quarterly fundamental estimate for the period." + }, + { + "id": "median_first_biasfree_price_target_estimate", + "description": "The median value of the first bias-free price target estimate for the period." + }, + { + "id": "median_first_biasfree_fundamental_estimate", + "description": "The median value of the first bias-free fundamental estimate for the period." + }, + { + "id": "median_biasfree_quarterly_fundamental_estimate", + "description": "The median of bias-adjusted quarterly fundamental estimates for the period." + }, + { + "id": "median_bias_adjusted_price_target", + "description": "The median of bias-adjusted price target estimates for the period." + }, + { + "id": "median_bias_adjusted_fundamental_estimate", + "description": "The median of bias-adjusted fundamental estimates for the period." 
+ }, + { + "id": "mean_biasfree_quarterly_fundamental_estimate", + "description": "The mean of bias-adjusted quarterly fundamental estimates for the period." + }, + { + "id": "mean_bias_adjusted_price_target", + "description": "The mean of bias-adjusted price target estimates for the period." + }, + { + "id": "mean_bias_adjusted_fundamental_estimate", + "description": "The mean of bias-adjusted fundamental estimates for the period." + }, + { + "id": "max_biasfree_quarterly_fundamental_estimate", + "description": "The highest value among bias-adjusted quarterly fundamental estimates for the period." + }, + { + "id": "max_bias_adjusted_price_target", + "description": "The highest value among bias-adjusted price target estimates for the period." + }, + { + "id": "max_bias_adjusted_fundamental_estimate", + "description": "The highest value among bias-adjusted fundamental estimates for the period." + }, + { + "id": "forecast_horizon_months", + "description": "The time horizon in months for which the price target estimate is made." + }, + { + "id": "first_biasfree_price_target_analogue", + "description": "The first bias-free analogue value for a price target forecast from an analyst." + }, + { + "id": "first_biasfree_fundamental_analogue", + "description": "The first bias-free analogue value for a fundamental forecast from an analyst." + }, + { + "id": "estimate_currency_code_9", + "description": "The currency in which the current fundamental estimate is recorded." + }, + { + "id": "count_biasfree_quarterly_fundamental_estimates", + "description": "The number of available bias-adjusted quarterly fundamental estimates for the period." + }, + { + "id": "count_bias_adjusted_price_target_estimates", + "description": "The number of available bias-adjusted price target estimates for the period." + }, + { + "id": "count_bias_adjusted_fundamental_estimates", + "description": "The number of available bias-adjusted fundamental estimates for the period." + }, + { + "id": "biasfree_analyst_price_target", + "description": "A single analyst's bias-adjusted price target estimate for a security." + }, + { + "id": "biasfree_analyst_fundamental_estimate", + "description": "A single analyst's bias-adjusted fundamental estimate for a security." + }, + { + "id": "avg_third_biasfree_price_target_estimate", + "description": "The average value of the third bias-free price target estimate for the period." + }, + { + "id": "avg_second_biasfree_quarterly_fundamental", + "description": "The average value of the second bias-free quarterly fundamental estimate for the period." + }, + { + "id": "avg_second_biasfree_price_target_estimate", + "description": "The average value of the second bias-free price target estimate for the period." + }, + { + "id": "avg_second_biasfree_fundamental_estimate", + "description": "The average value of the second bias-free fundamental estimate for the period." + }, + { + "id": "avg_first_biasfree_quarterly_fundamental", + "description": "The average value of the first bias-free quarterly fundamental estimate for the period." + }, + { + "id": "avg_first_biasfree_price_target_estimate", + "description": "The average value of the first bias-free price target estimate for the period." + }, + { + "id": "avg_first_biasfree_fundamental_estimate", + "description": "The average value of the first bias-free fundamental estimate for the period." 
+    }
+  ]
+}
\ No newline at end of file
diff --git a/temporary_script/synchronize_alpha_performance_data.py b/temporary_script/synchronize_alpha_performance_data.py
new file mode 100644
index 0000000..f35aed4
--- /dev/null
+++ b/temporary_script/synchronize_alpha_performance_data.py
@@ -0,0 +1,366 @@
+# -*- coding: utf-8 -*-
+import json
+import httpx
+import random
+import time
+from datetime import datetime, timedelta
+
+
+class OdooClient:
+    """Odoo JSON-RPC client class."""
+
+    def __init__(self, url, db_name, username, password):
+        """
+        Initialize the Odoo client and log in automatically.
+
+        Args:
+            url: Odoo server address
+            db_name: database name
+            username: user name
+            password: password
+        """
+        self.url = url
+        self.db_name = db_name
+        self.username = username
+        self.password = password
+        self.client = None
+        self.uid = None
+
+        # Log in automatically
+        self.login()
+
+    def login(self):
+        """Log in and obtain the uid and HTTP client."""
+        try:
+            self.client = httpx.Client(timeout=30.0)
+
+            # Odoo login
+            payload = {
+                "jsonrpc": "2.0",
+                "method": "call",
+                "params": {
+                    "service": "common",
+                    "method": "login",
+                    "args": [self.db_name, self.username, self.password]
+                },
+                "id": 1
+            }
+
+            response = self.client.post(f"{self.url}/jsonrpc", json=payload)
+            result = response.json()
+
+            # Check for errors
+            if "error" in result:
+                raise Exception(f"Login failed: {result['error']}")
+
+            # In Odoo's login response, "result" is the uid itself
+            self.uid = result.get("result")
+
+            if not self.uid:
+                raise Exception("Login failed: no UID returned")
+
+            print(f"Login successful, UID: {self.uid}")
+            return True
+
+        except Exception as e:
+            print(f"Login failed: {e}")
+            if self.client:
+                self.client.close()
+            self.client = None
+            self.uid = None
+            raise
+
+    def logout(self):
+        """Log out and close the connection."""
+        if self.client:
+            self.client.close()
+            self.client = None
+            self.uid = None
+            print("Logged out")
+
+    def search_data(self, model, domain, fields=None, order=None, limit=None):
+        """
+        Generic search method.
+
+        Args:
+            model: model name
+            domain: list of search conditions
+            fields: list of fields to return
+            order: sort order
+            limit: maximum number of records to return
+
+        Returns:
+            List of query results, or None on failure.
+        """
+        if not self.client or not self.uid:
+            raise Exception("Not logged in or connection lost")
+
+        # Build the arguments
+        args = [domain]
+        if fields:
+            args.append(fields)
+
+        kwargs = {}
+        if order:
+            kwargs['order'] = order
+        if limit:
+            kwargs['limit'] = limit
+
+        payload = {
+            "jsonrpc": "2.0",
+            "method": "call",
+            "params": {
+                "service": "object",
+                "method": "execute_kw",
+                "args": [
+                    self.db_name,
+                    self.uid,
+                    self.password,
+                    model,
+                    "search_read",
+                    args,
+                    kwargs
+                ]
+            },
+            "id": 2
+        }
+
+        try:
+            response = self.client.post(f"{self.url}/jsonrpc", json=payload)
+            result = response.json()
+
+            if "error" in result:
+                print(f"Query failed: {result['error']}")
+                return None
+
+            return result.get("result", [])
+
+        except Exception as e:
+            print(f"Query error: {e}")
+            return None
+
+    def write_data(self, model, record_id, values):
+        """
+        Update a record.
+
+        Args:
+            model: model name
+            record_id: record ID
+            values: dict of field values to update
+
+        Returns:
+            Whether the update succeeded.
+        """
+        if not self.client or not self.uid:
+            raise Exception("Not logged in or connection lost")
+
+        payload = {
+            "jsonrpc": "2.0",
+            "method": "call",
+            "params": {
+                "service": "object",
+                "method": "execute_kw",
+                "args": [
+                    self.db_name,
+                    self.uid,
+                    self.password,
+                    model,
+                    "write",
+                    [[record_id], values]
+                ]
+            },
+            "id": 3
+        }
+
+        try:
+            response = self.client.post(f"{self.url}/jsonrpc", json=payload)
+            result = response.json()
+
+            if "error" in result:
+                print(f"Update failed: {result['error']}")
+                return False
+
+            return result.get("result", False)
+
+        except Exception as e:
+            print(f"Update error: {e}")
+            return False
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
"""上下文管理器出口,自动登出""" + self.logout() + + +class SimpleAlphaFetcher: + def __init__(self): + """ + 初始化 Alpha 获取器 + """ + self.client = None + self.login() + + def login(self): + """登录 WorldQuant Brain API""" + try: + # 从 nacos 获取账号密码 + with httpx.Client(timeout=10.0) as temp_client: + nacos_resp = temp_client.get( + 'http://192.168.31.41:30848/nacos/v1/cs/configs?dataId=wq_account&group=quantify' + ) + + if nacos_resp.status_code != 200: + print('获取账号密码失败') + return False + + config = nacos_resp.json() + username = config.get('user_name') + password = config.get('password') + + if not username or not password: + print('账号密码不完整') + return False + + print(f"正在登录账户: {username}") + + # 创建客户端并设置超时 + timeout = httpx.Timeout(connect=30.0, read=60.0, write=30.0, pool=30.0) + self.client = httpx.Client( + auth=httpx.BasicAuth(username, password), + timeout=timeout + ) + + # 发送登录请求 + response = self.client.post('https://api.worldquantbrain.com/authentication') + + if response.status_code == 201: + print("登录成功!") + return True + else: + print(f"登录失败: {response.status_code} - {response.text}") + self.client.close() + self.client = None + return False + + except Exception as e: + print(f"登录异常: {e}") + return False + + def get_alpha_detail(self, alpha_id): + """ + 获取 Alpha 详细信息,带3次重试 + + Args: + alpha_id: Alpha ID + + Returns: + Alpha 详细信息字典,失败返回 None + """ + if not self.client: + print("客户端未初始化") + return None + + url = f"https://api.worldquantbrain.com/alphas/{alpha_id}" + + for attempt in range(3): + try: + response = self.client.get(url) + if response.status_code == 200: + return response.json() + else: + print(f"获取 Alpha 失败 (尝试 {attempt + 1}/3): {response.status_code} - {response.text}") + except Exception as e: + print(f"获取 Alpha 异常 (尝试 {attempt + 1}/3): {e}") + + if attempt < 2: + sleep_time = random.uniform(5, 8) + print(f"等待 {sleep_time:.1f} 秒后重试...") + time.sleep(sleep_time) + + return None + + def logout(self): + """退出登录""" + if self.client: + self.client.close() + self.client = None + print("Alpha 客户端已退出") + + +# 使用 OdooClient 类重构后的函数 +def fetch_local_performance(odoo_client, model, domain, fields, limit): + """获取本地表现数据""" + # 执行搜索 + result = odoo_client.search_data(model=model, domain=domain, fields=fields, order="id desc", limit=limit) + + if result: + return result + else: + print("未获取到数据") + exit(1) + + +# 使用示例 +if __name__ == "__main__": + # ============================== Odoo 连接配置 ==================================== + ODOO_URL = "http://192.168.31.41:32000" + DB_NAME = "quantify" + USERNAME = "rpc" + PASSWORD = "aaaAAA111" + + # ============================== 搜索设置 ==================================== + days = 7 + now = datetime.now() + today_zero = now.replace(hour=0, minute=0, second=0, microsecond=0) + days_ago_zero = today_zero - timedelta(days=days) + time_range = ('write_date', '>=', days_ago_zero.strftime('%Y-%m-%d %H:%M:%S')) + # 模型名称 + model = "alpha.expression.line" + # 搜索条件 + domain = [('status', '=', 'success'), ('performance', '=', '{}')] + # 搜索字段 + fields = ['alpha_id'] + # 搜索数量限制 + limit = 1 + + + try: + with OdooClient(ODOO_URL, DB_NAME, USERNAME, PASSWORD) as odoo: + all_data = fetch_local_performance(odoo, model, domain, fields, limit) + + # 初始化 Alpha 获取器 + alpha_fetcher = SimpleAlphaFetcher() + + try: + for data in all_data: + alpha_expression_line_id = data.get('id') + alpha_id = data.get('alpha_id') + + print(f'正在处理: {alpha_id}') + + if not alpha_id: + print(f"记录 {alpha_expression_line_id} 没有 alpha_id,跳过") + continue + + # 获取 Alpha 详细信息 + alpha_detail = 
+
+
+# Helper refactored to use the OdooClient class
+def fetch_local_performance(odoo_client, model, domain, fields, limit):
+    """Fetch local performance records."""
+    # Run the search
+    result = odoo_client.search_data(model=model, domain=domain, fields=fields, order="id desc", limit=limit)
+
+    if result:
+        return result
+    else:
+        print("No data returned")
+        raise SystemExit(1)
+
+
+# Usage example
+if __name__ == "__main__":
+    # ============================== Odoo connection settings ====================================
+    ODOO_URL = "http://192.168.31.41:32000"
+    DB_NAME = "quantify"
+    USERNAME = "rpc"
+    PASSWORD = "aaaAAA111"
+
+    # ============================== Search settings ====================================
+    days = 7
+    now = datetime.now()
+    today_zero = now.replace(hour=0, minute=0, second=0, microsecond=0)
+    days_ago_zero = today_zero - timedelta(days=days)
+    time_range = ('write_date', '>=', days_ago_zero.strftime('%Y-%m-%d %H:%M:%S'))
+    # Model name
+    model = "alpha.expression.line"
+    # Search domain; time_range limits results to records written in the last `days` days
+    domain = [time_range, ('status', '=', 'success'), ('performance', '=', '{}')]
+    # Fields to fetch
+    fields = ['alpha_id']
+    # Record limit
+    limit = 1
+
+    try:
+        with OdooClient(ODOO_URL, DB_NAME, USERNAME, PASSWORD) as odoo:
+            all_data = fetch_local_performance(odoo, model, domain, fields, limit)
+
+            # Initialize the alpha fetcher
+            alpha_fetcher = SimpleAlphaFetcher()
+
+            try:
+                for data in all_data:
+                    alpha_expression_line_id = data.get('id')
+                    alpha_id = data.get('alpha_id')
+
+                    print(f'Processing: {alpha_id}')
+
+                    if not alpha_id:
+                        print(f"Record {alpha_expression_line_id} has no alpha_id, skipping")
+                        continue
+
+                    # Fetch the alpha details
+                    alpha_detail = alpha_fetcher.get_alpha_detail(alpha_id)
+
+                    if alpha_detail:
+                        # Write the performance data back to Odoo
+                        update_values = {'performance': json.dumps(alpha_detail, indent=4, ensure_ascii=False)}
+                        success = odoo.write_data(model, alpha_expression_line_id, update_values)
+                        if success:
+                            print(f"Updated record {alpha_expression_line_id}")
+                        else:
+                            print(f"Failed to update record {alpha_expression_line_id}")
+                    else:
+                        print(f"Failed to fetch alpha {alpha_id}, skipping this record")
+
+            finally:
+                alpha_fetcher.logout()
+
+    except Exception as e:
+        print(f"Script failed: {e}")
\ No newline at end of file
diff --git a/test/wqb-login/login.py b/test/wqb-login/login.py
index 1ba13e2..a4f2445 100644
--- a/test/wqb-login/login.py
+++ b/test/wqb-login/login.py
@@ -2,7 +2,7 @@ import httpx
 from httpx import BasicAuth
 
 
-def login(credentials_file='account.txt'):
+def login():
     """Log in to the WorldQuant Brain API"""
     # Fetch the account credentials from nacos
     nacos_resp = httpx.get('http://192.168.31.41:30848/nacos/v1/cs/configs?dataId=wq_account&group=quantify')