import requests import json import sys import asyncio import openai import re from typing import Optional, Union # Added this import try: from .validator_hooks import is_valid_template_expr, has_empty_datafield_candidates except Exception: # Fallback for direct script execution try: from validator_hooks import is_valid_template_expr, has_empty_datafield_candidates except Exception: is_valid_template_expr = None has_empty_datafield_candidates = None # --- Validation wrappers to integrate into the pipeline --- def _filter_valid_templates( proposed_templates: dict, operators_meta, brain_session, settings: dict, parse_alpha_code_func, ): """Return dict of only templates that pass validation. Safe no-op if validation helpers are unavailable. """ if not is_valid_template_expr or not parse_alpha_code_func: return proposed_templates filtered = {} for template_expr, template_expl in proposed_templates.items(): try: if is_valid_template_expr( template_expr, operators_meta, brain_session, settings, parse_alpha_code_func, ): filtered[template_expr] = template_expl except Exception: # Be conservative: drop on exceptions continue return filtered def _should_skip_due_to_empty_candidates(populated_info: dict) -> bool: """True if any data_field placeholder has zero candidates. Safe no-op fallback when helper is missing. 
""" if not has_empty_datafield_candidates: return False try: return has_empty_datafield_candidates(populated_info) except Exception: return False import logging import pandas as pd import os from pathlib import Path from urllib.parse import urljoin import time import threading import itertools import getpass import io import validator as val from ace_lib import get_instrument_type_region_delay # Force stdout/stderr to use utf-8 on Windows to avoid UnicodeEncodeError if sys.platform.startswith('win'): try: sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8') except Exception: pass # 这些变量将在交互式输入中设置 LLM_model_name = None LLM_API_KEY = None llm_base_url = None username = None password = None DATA_CATEGORIES = None # 加载模板总结文件 template_summary_path = os.path.join(os.path.dirname(__file__), "template_summary.md") try: with open(template_summary_path, "r", encoding="utf-8") as f: template_summary = f.read() print(f"✓ 已加载模板总结文件: {template_summary_path}", flush=True) except FileNotFoundError: print(f"⚠ 模板总结文件不存在: {template_summary_path},使用内置模板", flush=True) template_summary = """# BRAIN论坛Alpha模板精华总结 请创建 template_summary.md 文件""" except Exception as e: print(f"⚠ 加载模板总结文件失败: {e},使用内置模板", flush=True) template_summary = """# BRAIN论坛Alpha模板精华总结 请检查 template_summary.md 文件""" class SingleSession(requests.Session): _instance = None _lock = threading.Lock() _relogin_lock = threading.Lock() _initialized = False def __new__(cls, *args, **kwargs): if cls._instance is None: with cls._lock: if cls._instance is None: cls._instance = super().__new__(cls) return cls._instance def __init__(self, *args, **kwargs): if not self._initialized: super(SingleSession, self).__init__(*args, **kwargs) self._initialized = True def get_relogin_lock(self): return self._relogin_lock def load_template_summary(file_path: Optional[str] = None) -> str: """ Loads the template summary from a file or returns the built-in template 
summary. Args: file_path: Optional path to a .txt or .md file containing the template summary. If None or file doesn't exist, returns the built-in template summary. Returns: str: The template summary content. """ if file_path: try: file_path_obj = Path(file_path) if file_path_obj.exists() and file_path_obj.is_file(): with open(file_path_obj, 'r', encoding='utf-8') as f: content = f.read() print(f"✓ 成功从文件加载模板总结: {file_path}", flush=True) return content else: print(f"⚠ 警告: 文件不存在: {file_path},将使用内置模板总结", flush=True) except Exception as e: print(f"⚠ 警告: 读取文件时出错: {e},将使用内置模板总结", flush=True) # 返回内置的模板总结 print("✓ 使用内置模板总结", flush=True) return template_summary def get_credentials() -> tuple[str, str]: """ Retrieve or prompt for platform credentials. This function attempts to read credentials from a JSON file in the user's home directory. If the file doesn't exist or is empty, it prompts the user to enter credentials and saves them. Returns: tuple: A tuple containing the email and password. Raises: json.JSONDecodeError: If the credentials file exists but contains invalid JSON. """ # 声明使用全局变量 global username, password # please input your own BRAIN Credentials into the function return (username, password) def get_token_from_auth_server() -> str: # 声明使用全局变量 global LLM_API_KEY # please input your own LLM Gateway token into the function, please note, we are using kimi-k2.5 model return LLM_API_KEY def interactive_input() -> dict: """ 交互式输入函数,收集所有必要的配置信息。 Returns: dict: 包含所有配置信息的字典 """ print("\n" + "="*60, flush=True) print("欢迎使用 Alpha Transformer 交互式配置", flush=True) print("此程序在于让您输入一个Alpha ID即可通过历史总结的Alpha模板,转化成更多的表达式", flush=True) print("72变,助您腾云驾雾", flush=True) print("如果你想修改模型,则可以使用新模型的url和api key", flush=True) print("不同模型效果不同,默认的kimi可能会产生语法错误,请检查生成的模板文件进行甄别", flush=True) print("强烈推荐你使用自己总结的模板文档,效果会更好", flush=True) print("="*60 + "\n", flush=True) config = {} # 1. 
询问 LLM 模型名称 print("【1/6】LLM 模型配置", flush=True) print("如果你想修改模型,则可以使用新模型的名称", flush=True) default_model = "kimi-k2.5" model_input = input(f"请输入 LLM 模型名称 (直接回车使用默认值: {default_model}): ").strip() config['LLM_model_name'] = model_input if model_input else default_model print(f"✓ LLM 模型名称: {config['LLM_model_name']}\n", flush=True) # 2. 询问 LLM API Key print("【2/6】LLM API Key 配置", flush=True) api_key = getpass.getpass("请输入 LLM API Key (输入时不会显示): ").strip() if not api_key: print("⚠ 警告: API Key 为空,程序可能无法正常工作", flush=True) config['LLM_API_KEY'] = api_key print("✓ API Key 已设置\n", flush=True) # 3. 询问 LLM Base URL print("【3/6】LLM Base URL 配置", flush=True) print("提示:不同模型有不同的URL", flush=True) default_url = "https://api.moonshot.cn/v1" url_input = input(f"请输入 LLM Base URL (直接回车使用默认值: {default_url}): ").strip() config['llm_base_url'] = url_input if url_input else default_url print(f"✓ LLM Base URL: {config['llm_base_url']}\n", flush=True) # 4. 询问 BRAIN 平台用户名 print("【4/6】BRAIN 平台认证信息", flush=True) username_input = input("请输入 BRAIN 平台用户名/邮箱: ").strip() if not username_input: print("⚠ 警告: 用户名为空,程序可能无法正常工作", flush=True) config['username'] = username_input print("✓ 用户名已设置\n", flush=True) # 5. 询问 BRAIN 平台密码 password_input = getpass.getpass("请输入 BRAIN 平台密码 (输入时不会显示): ").strip() if not password_input: print("⚠ 警告: 密码为空,程序可能无法正常工作", flush=True) config['password'] = password_input print("✓ 密码已设置\n", flush=True) # 6. 询问模板总结文件路径 print("【5/6】模板总结文件配置", flush=True) print("强烈推荐你使用自己总结的模板文档,效果会更好", flush=True) print("提示: 如果您有 template_summary 的 .txt 或 .md 文件,请输入完整路径", flush=True) print(" 如果没有,直接回车将使用内置模板总结", flush=True) template_path = input("请输入模板总结文件路径 (直接回车使用内置模板): ").strip() config['template_summary_path'] = template_path if template_path else None if template_path: print(f"✓ 将尝试从文件加载: {template_path}\n", flush=True) else: print("✓ 将使用内置模板总结\n", flush=True) # 7. 
询问 Alpha ID print("【6/7】Alpha ID 配置", flush=True) alpha_id = input("请输入要处理的 Alpha ID: ").strip() if not alpha_id: print("❌ 错误: Alpha ID 不能为空", flush=True) sys.exit(1) config['alpha_id'] = alpha_id print(f"✓ Alpha ID: {alpha_id}\n", flush=True) # 8. 询问 Top N 参数(仅数据字段) print("【7/7】候选数量配置 (Top N)", flush=True) print("提示: 此参数控制为每个占位符生成的数据字段候选数量", flush=True) # Datafield top_n default_datafield_topn = 50 datafield_topn_input = input(f"请输入数据字段候选数量 (直接回车使用默认值: {default_datafield_topn}): ").strip() try: config['top_n_datafield'] = int(datafield_topn_input) if datafield_topn_input else default_datafield_topn except ValueError: print(f"⚠ 警告: 输入无效,使用默认值: {default_datafield_topn}", flush=True) config['top_n_datafield'] = default_datafield_topn print(f"✓ 数据字段候选数量: {config['top_n_datafield']}\n", flush=True) print("="*60, flush=True) print("配置完成!开始处理...", flush=True) print("="*60 + "\n", flush=True) return config def expand_dict_columns(data: pd.DataFrame) -> pd.DataFrame: """ Expand dictionary columns in a DataFrame into separate columns. Args: data (pandas.DataFrame): The input DataFrame with dictionary columns. Returns: pandas.DataFrame: A new DataFrame with expanded columns. """ dict_columns = list(filter(lambda x: isinstance(data[x].iloc[0], dict), data.columns)) new_columns = pd.concat( [data[col].apply(pd.Series).rename(columns=lambda x: f"{col}_{x}") for col in dict_columns], axis=1, ) data = pd.concat([data, new_columns], axis=1) return data def start_session() -> SingleSession: """ Start a new session with the WorldQuant BRAIN platform. This function authenticates the user, handles biometric authentication if required, and creates a new session. Returns: SingleSession: An authenticated session object. Raises: requests.exceptions.RequestException: If there's an error during the authentication process. 
""" brain_api_url = "https://api.worldquantbrain.com" s = SingleSession() s.auth = get_credentials() r = s.post(brain_api_url + "/authentication") print(f"New session created (ID: {id(s)}) with authentication response: {r.status_code}, {r.json()} (新会话已创建)", flush=True) if r.status_code == requests.status_codes.codes.unauthorized: if r.headers["WWW-Authenticate"] == "persona": print( "Complete biometrics authentication and press any key to continue (请完成生物识别认证并按任意键继续): \n" + urljoin(r.url, r.headers["Location"]) + "\n" ) input() s.post(urljoin(r.url, r.headers["Location"])) while True: if s.post(urljoin(r.url, r.headers["Location"])).status_code != 201: input( "Biometrics authentication is not complete. Please try again and press any key when completed (生物识别认证未完成,请重试并按任意键): \n" ) else: break else: print("\nIncorrect email or password (邮箱或密码错误)\n", flush=True) return start_session() return s def get_data_categories(s: SingleSession) -> list[dict]: """ Fetch and cache data categories from the BRAIN API. """ global DATA_CATEGORIES if DATA_CATEGORIES is not None: return DATA_CATEGORIES try: brain_api_url = "https://api.worldquantbrain.com" response = s.get(brain_api_url + "/data-categories") response.raise_for_status() data = response.json() if isinstance(data, list): DATA_CATEGORIES = data elif isinstance(data, dict): DATA_CATEGORIES = data.get('results', []) else: DATA_CATEGORIES = [] return DATA_CATEGORIES except Exception as e: print(f"Error fetching data categories: {e}", flush=True) return [] def get_datafields( s: SingleSession, instrument_type: str = "EQUITY", region: str = "USA", delay: int = 1, universe: str = "TOP3000", theme: str = "false", dataset_id: str = "", data_type: str = "MATRIX", search: str = "", category: Union[str, list] = "", ) -> pd.DataFrame: """ Retrieve available datafields based on specified parameters. Args: s (SingleSession): An authenticated session object. instrument_type (str, optional): The type of instrument. Defaults to "EQUITY". 
region (str, optional): The region. Defaults to "USA". delay (int, optional): The delay. Defaults to 1. universe (str, optional): The universe. Defaults to "TOP3000". theme (str, optional): The theme. Defaults to "false". dataset_id (str, optional): The ID of a specific dataset. Defaults to "". data_type (str, optional): The type of data. Defaults to "MATRIX". search (str, optional): A search string to filter datafields. Defaults to "". category (str or list, optional): A category ID or list of IDs to filter datafields. Defaults to "". Returns: pandas.DataFrame: A DataFrame containing information about available datafields. """ brain_api_url = "https://api.worldquantbrain.com" type_param = f"&type={data_type}" if data_type != "ALL" else "" url_template = ( brain_api_url + "/data-fields?" + f"&instrumentType={instrument_type}" + f"®ion={region}&delay={str(delay)}&universe={universe}{type_param}&limit=50" ) if dataset_id: url_template += f"&dataset.id={dataset_id}" if len(search) > 0: url_template += f"&search={search}" url_template += "&offset={x}" count = 0 if len(search) == 0: try: count = s.get(url_template.format(x=0)).json()["count"] except Exception as e: print(f"Error getting count: {e}", flush=True) return pd.DataFrame() if count == 0: print( f"No fields found (未找到字段): region={region}, delay={str(delay)}, universe={universe}, " f"type={data_type}, dataset.id={dataset_id}" ) return pd.DataFrame() else: if category: count = 500 # Search deeper if filtering else: count = 100 max_try = 5 datafields_list = [] found_count = 0 target_found = 50 if category else count time.sleep(2) for x in range(0, count, 50): for _ in range(max_try): try: resp = s.get(url_template.format(x=x)) while resp.status_code == 429: print("status_code 429, sleep 3 seconds", flush=True) time.sleep(3) resp = s.get(url_template.format(x=x)) if resp.status_code == 200 and "results" in resp.json(): datafields = resp break except: pass time.sleep(5) else: continue results = 
datafields.json().get("results", []) if not results: break if category: if isinstance(category, list): filtered_results = [ item for item in results if isinstance(item.get('category'), dict) and item['category'].get('id') in category ] else: filtered_results = [ item for item in results if isinstance(item.get('category'), dict) and item['category'].get('id') == category ] datafields_list.append(filtered_results) found_count += len(filtered_results) if len(search) > 0 and found_count >= target_found: break else: datafields_list.append(results) datafields_list_flat = [item for sublist in datafields_list for item in sublist] if not datafields_list_flat: return pd.DataFrame() datafields_df = pd.DataFrame(datafields_list_flat) datafields_df = expand_dict_columns(datafields_df) return datafields_df def set_alpha_properties( s: SingleSession, alpha_id: str, name: Optional[str] = None, color: Optional[str] = None, regular_desc: Optional[str] = None, selection_desc: str = "None", combo_desc: str = "None", tags: Optional[list[str]] = None, ) -> requests.Response: """ Update the properties of an alpha. Args: s (SingleSession): An authenticated session object. alpha_id (str): The ID of the alpha to update. name (str, optional): The new name for the alpha. Defaults to None. color (str, optional): The new color for the alpha. Defaults to None. regular_desc (str, optional): Description for regular alpha. Defaults to None. selection_desc (str, optional): Description for the selection part of a super alpha. Defaults to "None". combo_desc (str, optional): Description for the combo part of a super alpha. Defaults to "None". tags (list, optional): List of tags to apply to the alpha. Defaults to None. Returns: requests.Response: The response object from the API call. 
""" brain_api_url = "https://api.worldquantbrain.com" params = {} if name is not None: params["name"] = name if color is not None: params["color"] = color if tags is not None: params["tags"] = tags if regular_desc is not None: params.setdefault("regular", {})["description"] = regular_desc if selection_desc != "None": # Assuming "None" is the default string value for selection_desc params.setdefault("selection", {})["description"] = selection_desc if combo_desc != "None": # Assuming "None" is the default string value for combo_desc params.setdefault("combo", {})["description"] = combo_desc response = s.patch(brain_api_url + "/alphas/" + alpha_id, json=params) return response def extract_placeholders(template_expression: str) -> list[str]: """ Extracts placeholders from a template expression using regular expressions. Placeholders are identified by text enclosed in angle brackets (e.g., ``). """ # Only match placeholders of the form `` or `` with alphanumeric and underscores return re.findall(r'(<[A-Za-z0-9_]+/>)', template_expression) def parse_alpha_code(alpha_code: str, all_operators: list[dict]) -> tuple[list[str], list[str]]: """ Parses the alpha code to extract operators and data fields. """ # Remove C-style comments /* ... */ alpha_code = re.sub(r"/\*[\s\S]*?\*/", "", alpha_code) # Remove Python-style comments # ... alpha_code = re.sub(r"#.*", "", alpha_code) operators_names = [op['name'] for op in all_operators] found_operators = [] found_datafields = [] # Regex to find potential identifiers (operators or datafields) # This regex looks for words that could be operators or datafields, # excluding numbers and common programming constructs. 
identifiers = re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', alpha_code) for identifier in identifiers: if identifier in operators_names: found_operators.append(identifier) elif not (identifier.isdigit() or identifier.lower() in ['true', 'false', 'null', 'nan', 'if', 'else', 'for', 'while', 'return', 'and', 'or', 'not', 'in', 'is', 'try', 'except', 'finally', 'with', 'as', 'def', 'class', 'import', 'from', 'yield', 'lambda', 'global', 'nonlocal', 'break', 'continue', 'pass', 'async', 'await', 'raise', 'assert', 'del', 'print', 'input', 'len', 'min', 'max', 'sum', 'abs', 'round', 'int', 'float', 'str', 'list', 'dict', 'set', 'tuple', 'range', 'map', 'filter', 'zip', 'open', 'file', 'type', 'id', 'dir', 'help', 'object', 'super', 'issubclass', 'isinstance', 'hasattr', 'getattr', 'setattr', 'delattr', '__import__', 'None', 'True', 'False']): found_datafields.append(identifier) # Remove duplicates found_operators = list(set(found_operators)) found_datafields = list(set(found_datafields)) return found_operators, found_datafields async def generate_alpha_description(alpha_id: str, brain_session: SingleSession) -> str: """ Generates and potentially enriches the description of a given Alpha ID from the WorldQuant BRAIN API. Args: alpha_id (str): The ID of the alpha to retrieve. brain_session (SingleSession): The active BRAIN API session. llm_client (openai.AsyncOpenAI): The authenticated OpenAI-compatible client. Returns: str: A JSON string containing the alpha's settings, expression, and potentially enriched description, or an empty JSON string if an error occurs. """ async def call_llm_new(prompt: str) -> dict: # 声明使用全局变量 global LLM_model_name, LLM_API_KEY, llm_base_url try: llm_api_key = get_token_from_auth_server() llm_base_url_value = llm_base_url # 使用全局变量 llm_client = openai.AsyncOpenAI(base_url=llm_base_url_value, api_key=llm_api_key) print("LLM Gateway Authentication successful. 
(LLM网关认证成功)", flush=True) except Exception as e: print(f"LLM Gateway Authentication failed (LLM网关认证失败): {e}", flush=True) sys.exit(1) print("--- Calling LLM to propose templates... (正在调用LLM生成模板...) ---", flush=True) try: # Await the async create call response = await llm_client.chat.completions.create( model=LLM_model_name, messages=[ {"role": "system", "content": "You are a quantitative finance expert and a helpful assistant designed to output JSON."}, {"role": "user", "content": prompt}, ], # response_format={"type": "json_object"}, ) # The async client may return a nested structure. Try to extract content robustly. content = None if isinstance(response, dict): # Some clients return raw dicts # Try common paths choices = response.get('choices') if choices and isinstance(choices, list): msg = choices[0].get('message') or choices[0] content = msg.get('content') if isinstance(msg, dict) else None elif 'content' in response: content = response.get('content') else: # Fallback: attempt attribute access try: content = response.choices[0].message.content except Exception: content = None if content is None: # As a last resort, try to stringify the response content = str(response) # If content is already a dict/list, return it directly; if it's a JSON string, parse it. 
if isinstance(content, (dict, list)): return content if isinstance(content, str): try: return json.loads(content) except json.JSONDecodeError: # Return wrapped string if not JSON return {"text": content} return {} except Exception as e: print(f"Error calling LLM (调用LLM出错): {e}", flush=True) return {} try: brain_api_url = "https://api.worldquantbrain.com" alpha_url = f"{brain_api_url}/alphas/{alpha_id}" response = brain_session.get(alpha_url) response.raise_for_status() # Raise an exception for HTTP errors alpha_data = response.json() settings = alpha_data.get('settings', {}) expression_dict = alpha_data.get('regular', alpha_data.get('combo', None)) if not expression_dict or 'code' not in expression_dict: print(f"Error: Alpha expression code not found for Alpha ID (未找到Alpha表达式代码): {alpha_id}", flush=True) return json.dumps({}) alpha_code = expression_dict['code'] current_description = expression_dict.get('description', '') # 1. Get all operators for parsing (no filter as per feedback) operators_data = get_brain_operators() all_operators = operators_data.get('operators', []) # 2. Parse the code to get operators and datafields found_operators_names, found_datafields_names = parse_alpha_code(alpha_code, all_operators) # 3. Get descriptions for operators operator_descriptions = {op['name']: op.get('description', 'No description available.') for op in all_operators if op['name'] in found_operators_names} # 4. 
Get descriptions for datafields datafield_descriptions = {} if found_datafields_names: # Extract settings from alpha_data for the get_datafields call instrument_type = settings.get('instrumentType', 'EQUITY') region = settings.get('region', 'USA') universe = settings.get('universe', 'TOP3000') delay = settings.get('delay', 1) for df_name in found_datafields_names: # get_datafields returns a DataFrame, so we need to process it datafield_df = get_datafields(s=brain_session, instrument_type=instrument_type, region=region, delay=delay, universe=universe, search=df_name) if not datafield_df.empty: # Assuming the first result is the most relevant datafield_descriptions[df_name] = datafield_df.iloc[0].get('description', 'No description available.') else: datafield_descriptions[df_name] = 'No description found.' # 5. Use LLM to judge if current description is good judgment_prompt = f""" Given the following alpha code, its current description, and descriptions of its operators and datafields: Alpha Code: {alpha_code} Current Description: {current_description} Operators and their descriptions: {json.dumps(operator_descriptions, indent=2)} Datafields and their descriptions: {json.dumps(datafield_descriptions, indent=2)} Alpha Settings: {json.dumps(settings, indent=2)} Is the current description good enough? Respond with 'yes' or 'no' in a JSON object: {{"judgment": "yes/no"}} A "good" description should clearly explain the investment idea, rationale for data used, and rationale for operators used. """ judgment_response = await call_llm_new(judgment_prompt) is_description_good = judgment_response.get("judgment", "no").lower() == "yes" new_description = current_description if not is_description_good: # 6. If not good, use another LLM to generate a new description generation_prompt = f""" Based on the following alpha code, its operators, datafields, and settings, generate a new, improved description. 
The description should clearly explain the investment idea, rationale for data used, and rationale for operators used. Format the output as: "Idea: xxxxx\\nRationale for data used: xxxxx\\nRationale for operators used: xxxxxxx" Alpha Code: {alpha_code} Operators and their descriptions: {json.dumps(operator_descriptions, indent=2)} Datafields and their descriptions: {json.dumps(datafield_descriptions, indent=2)} Alpha Settings: {json.dumps(settings, indent=2)} """ generated_description_response = await call_llm_new(generation_prompt) # Assuming LLM returns a string directly or a JSON with a 'description' key new_description = generated_description_response.get("description", generated_description_response) if isinstance(new_description, dict): # Handle cases where LLM might return a dict directly new_description = json.dumps(new_description, indent=2) # 7. Override this new description and patch the alpha set_alpha_properties( s=brain_session, alpha_id=alpha_id, regular_desc=new_description ) print(f"Alpha {alpha_id} description updated on platform. (Alpha描述已在平台更新)", flush=True) if 'regular' in alpha_data: alpha_data['regular']['description'] = new_description elif 'combo' in alpha_data: alpha_data['combo']['description'] = new_description return json.dumps({ 'settings': settings, 'expression': expression_dict }) except requests.exceptions.RequestException as e: print(f"Error during API request (API请求出错): {e}", flush=True) return json.dumps({}) except json.JSONDecodeError: print("Error: Could not decode JSON response from API. (无法解析API的JSON响应)", flush=True) return json.dumps({}) except Exception as e: print(f"An unexpected error occurred (发生意外错误): {e}", flush=True) return json.dumps({}) def get_brain_operators(scope_filters: Optional[list[str]] = None) -> dict: """ Retrieves the list of available operators from the WorldQuant BRAIN API, optionally filtered by a list of scopes. If no scopes are provided, all operators are returned. 
Args: scope_filters (list[str], optional): A list of strings to filter operators by their scope (e.g., ["REGULAR", "TS_OPERATOR"]). If None or empty, all operators are returned. Returns: dict: A dictionary containing the operators list and count, or an empty dictionary if an error occurs. """ try: brain_api_url = "https://api.worldquantbrain.com" session = start_session() operators_url = f"{brain_api_url}/operators" response = session.get(operators_url) response.raise_for_status() # Raise an exception for HTTP errors operators_list = response.json() if not isinstance(operators_list, list): print(f"Error: Expected a list of operators, but received type (预期运算符列表,但收到类型): {type(operators_list)}", flush=True) return {} if scope_filters: filtered_operators = [ op for op in operators_list if any(s_filter in op.get('scope', []) for s_filter in scope_filters) ] return { 'operators': filtered_operators, 'count': len(filtered_operators) } else: return { 'operators': operators_list, 'count': len(operators_list) } except requests.exceptions.RequestException as e: print(f"Error during API request for operators (获取运算符时API请求出错): {e}", flush=True) return {} except json.JSONDecodeError: print("Error: Could not decode JSON response from operators API. (无法解析运算符API的JSON响应)", flush=True) return {} except Exception as e: print(f"An unexpected error occurred while getting operators (获取运算符时发生意外错误): {e}", flush=True) return {} async def call_llm(prompt: str, llm_client: openai.AsyncOpenAI, max_retries: int = 3) -> dict: """ Interface with a Large Language Model to process prompts and get a JSON response. Includes retry logic for JSON parsing errors. """ # 声明使用全局变量 global LLM_model_name if not llm_client: print("LLM client not initialized. Please check authentication. (LLM客户端未初始化,请检查认证)", flush=True) return {} print("--- Calling LLM... (正在调用LLM...) 
---", flush=True) for attempt in range(max_retries): try: response = await llm_client.chat.completions.create( model=LLM_model_name, # Or your preferred model messages=[ {"role": "system", "content": "You are a quantitative finance expert and a helpful assistant designed to output JSON."}, {"role": "user", "content": prompt}, ], # response_format={"type": "json_object"}, ) content = response.choices[0].message.content # Try to clean markdown code blocks if present if "```json" in content: content = content.split("```json")[1].split("```")[0].strip() elif "```" in content: content = content.split("```")[1].split("```")[0].strip() return json.loads(content) except json.JSONDecodeError as e: print(f"⚠ JSON Decode Error (Attempt {attempt + 1}/{max_retries}): {e}", flush=True) if attempt == max_retries - 1: print(f"❌ Failed to parse JSON after {max_retries} attempts. Raw content: {content[:100]}...", flush=True) except Exception as e: print(f"⚠ LLM Call Error (Attempt {attempt + 1}/{max_retries}): {e}", flush=True) if attempt == max_retries - 1: print(f"❌ Failed to call LLM after {max_retries} attempts.", flush=True) # Wait before retrying await asyncio.sleep(2) return {} async def propose_alpha_templates(alpha_details: dict, template_summary: str, llm_client: openai.AsyncOpenAI, user_data_type: str = "MATRIX") -> dict: """ Uses an LLM to propose new alpha templates based on a seed alpha's details. Args: alpha_details (dict): The details of the seed alpha. template_summary (str): A summary of alpha templates to guide the LLM. llm_client (openai.AsyncOpenAI): The authenticated OpenAI-compatible client. user_data_type (str): The data type for the alpha (MATRIX or VECTOR). Returns: dict: A dictionary of proposed alpha templates in JSON format. """ if not alpha_details.get('expression'): print("Error: Alpha expression is missing. 
(错误:缺少Alpha表达式)", flush=True) return {} else: print(f"current seed alpha detail (当前种子Alpha详情): {alpha_details.get('expression')}", flush=True) data_type_instruction = "" if user_data_type == "MATRIX": data_type_instruction = "\n**Important Note on Data Type:**\nThe user has specified the data type as **MATRIX**. Please do NOT use any vector-type operators (e.g., `vec_avg`, `vec_sum`) in your proposed templates, as they will raise errors for MATRIX type data in BRAIN. Note: 'MATRIX' is just a system identifier and does not refer to mathematical matrices." elif user_data_type == "VECTOR": data_type_instruction = "\n**Important Note on Data Type:**\nThe user has specified the data type as **VECTOR**. Please ensure you use vector-type operators (e.g., `vec_avg`, `vec_sum`) to handle the data fields before applying other operators." prompt = f""" As a world-class BRAIN consultant, your task is to design new alpha templates based on an existing seed alpha. You will be provided with the seed alpha's expression and a summary of successful alpha templates for inspiration. **Seed Alpha Expression:** {alpha_details['expression']} **Inspiration: Summary of Alpha Templates:** {template_summary} **Your Task:** Based on the structure and potential economic rationale of the seed alpha, by the aid of the Alpha template summary, propose 3-5 new, diverse alpha templates. **Rules:** 1. The proposed templates must be valid BRAIN alpha expressions. 2. Use placeholders like `` for data fields and `` for operators that can be programmatically replaced later. 3. For each proposed template, provide a brief, clear explanation of its investment rationale. 4. Return the output as a single, valid JSON object where keys are the proposed template strings and values are their corresponding explanations. Do not include any other text or formatting outside of the JSON object. 5. 
async def propose_datafield_keywords(template_expression: str, template_explanation: str, placeholder: str, llm_client: openai.AsyncOpenAI, user_category: Optional[Union[str, list]] = None) -> list[str]:
    """Ask the LLM for 3-5 concise search keywords for a template placeholder.

    The keywords are later fed to the BRAIN ``get_datafields`` tool. Returns
    a list of keyword strings, or an empty list when the LLM reply cannot be
    interpreted as one.
    """
    if user_category:
        category_instruction = f"\n**User Specified Data Category:**\nThe user has specified the data category: {user_category}. Please ensure the proposed keywords are relevant to this category."
    else:
        category_instruction = "\n**Data Category:**\n Please propose keywords across diverse and relevant data categories."
    prompt = f"""
As a quantitative researcher, you need to find the best data fields for an alpha template placeholder.
Based on the template's logic and the placeholder's name, suggest a list of 3-5 concise search keywords to use with the WorldQuant BRAIN `get_datafields` tool.

**Alpha Template:**
`{template_expression}`

**Template Explanation:**
`{template_explanation}`

**Placeholder to Fill:**
`{placeholder}`
{category_instruction}

**Your Task:**
Provide a list of search keywords that are likely to yield relevant data fields for this placeholder. The keywords should be specific and diverse.
Return the output as a single, valid JSON array of strings.

**Example Input:**
Placeholder: ``
Explanation: "measures the time-series evolution of a fund's relative rank on a slow-moving characteristic (e.g., fund style, expense tier)"

**Example Output:**
["fund style", "expense ratio", "management fee", "turnover", "aum"]

Now, generate the JSON array of search keywords for the given placeholder.
"""
    print(f"--- Calling LLM to get keywords for placeholder (正在调用LLM获取占位符关键词): {placeholder} ---", flush=True)
    reply = await call_llm(prompt, llm_client)
    print(f"AI使用如下提示词获取搜索关键词推荐:{prompt}", flush=True)
    # The model may answer with either a bare JSON array or an object that
    # wraps the array under a well-known key; accept both shapes.
    if isinstance(reply, list):
        wrapped_lists = [reply]
    elif isinstance(reply, dict):
        wrapped_lists = [reply.get(key) for key in ('keywords', 'data', 'result', 'items')]
    else:
        wrapped_lists = []
    for candidate in wrapped_lists:
        if isinstance(candidate, list) and all(isinstance(item, str) for item in candidate):
            return candidate
    print(f"Warning: LLM did not return a valid list of strings for keywords (警告:LLM未返回有效的关键词列表). Got: {reply}", flush=True)
    return []
async def get_datafield_candidates(s: SingleSession, alpha_details: dict, template_expression: str, template_explanation: str, placeholder: str, llm_client: openai.AsyncOpenAI, top_n: int = 50, user_region: Optional[str] = None, user_universe: Optional[str] = None, user_delay: Optional[int] = None, user_category: Optional[Union[str, list]] = None, user_data_type: str = "MATRIX") -> list[dict]:
    """Collect candidate data fields for one placeholder.

    An LLM proposes search keywords, each keyword is searched in parallel via
    ``get_datafields``, and the deduplicated top-N rows per keyword are
    returned as ``[{"id": ..., "description": ...}, ...]``.
    """
    keywords = await propose_datafield_keywords(template_expression, template_explanation, placeholder, llm_client, user_category=user_category)
    if not keywords:
        print(f"Could not generate keywords for placeholder (无法生成占位符关键词): {placeholder}", flush=True)
        return []
    print(f"LLM-proposed keywords for '{placeholder}' (LLM提议的关键词): {keywords}", flush=True)
    # Search scope comes from the seed alpha's settings, with explicit user
    # overrides taking precedence.
    settings = alpha_details.get('settings', {})
    print(f"Alpha settings for datafield search (用于数据字段搜索的Alpha设置):", flush=True)
    instrument_type = settings.get('instrumentType', 'EQUITY')
    if user_region:
        region = user_region
    elif 'region' in settings:
        region = settings['region']
    else:
        print(f"❌ Error: Could not determine 'region' for datafield search. It is missing in Alpha settings and not provided by user. (错误:无法确定数据搜索的地区,Alpha设置中缺失且用户未提供)", flush=True)
        return []
    print(f" 数据地区: {region}", flush=True)
    if user_universe:
        universe = user_universe
    elif 'universe' in settings:
        universe = settings['universe']
    else:
        print(f"❌ Error: Could not determine 'universe' for datafield search. It is missing in Alpha settings and not provided by user. (错误:无法确定数据搜索的范围,Alpha设置中缺失且用户未提供)", flush=True)
        return []
    print(f" 数据范围: {universe}", flush=True)
    if user_delay is not None:
        delay = user_delay
    elif 'delay' in settings:
        delay = settings['delay']
    else:
        print(f"❌ Error: Could not determine 'delay' for datafield search. It is missing in Alpha settings and not provided by user. (错误:无法确定数据搜索的Delay,Alpha设置中缺失且用户未提供)", flush=True)
        return []
    print(f" Delay: {delay} 类别", flush=True)
    if user_category:
        print(f" Category Filter: {user_category}", flush=True)
    # Run one blocking get_datafields call per keyword concurrently.
    search_jobs = [
        asyncio.to_thread(
            get_datafields,
            s=s,
            instrument_type=instrument_type,
            region=region,
            delay=delay,
            universe=universe,
            search=keyword,
            category=user_category if user_category else "",
            data_type=user_data_type,
        )
        for keyword in keywords
    ]
    frames = await asyncio.gather(*search_jobs)
    # Keep only the first top_n rows of every non-empty result frame.
    per_keyword_tops = [frame.head(top_n) for frame in frames if not frame.empty]
    if not per_keyword_tops:
        return []
    merged = pd.concat(per_keyword_tops, ignore_index=True)
    merged.drop_duplicates(subset=['id'], inplace=True)
    return merged[['id', 'description']].to_dict(orient='records')
async def get_group_datafield_candidates(template_expression: str, template_explanation: str, placeholder: str, llm_client: openai.AsyncOpenAI, top_n: int = 3) -> list[dict]:
    """Pick grouping fields (industry/sector/...) for a placeholder via the LLM.

    Falls back to the first ``top_n`` predefined fields when the LLM reply is
    not a list of strings. Returns ``[{"name": ...}, ...]``.
    """
    predefined_group_fields = ["industry", "subindustry", "sector", "market", "exchange"]
    prompt = f"""
As a quantitative researcher, you need to select the most relevant group data fields for an alpha template placeholder.
Based on the template's logic and the placeholder's name, select {top_n} group fields from the following list that are most suitable: {predefined_group_fields}.

**Alpha Template:**
`{template_expression}`

**Template Explanation:**
`{template_explanation}`

**Placeholder to Fill:**
`{placeholder}`

**Your Task:**
Provide a list of selected group data fields.
Return the output as a single, valid JSON array of strings.

**Example Output Format:**
["industry", "sector"]

Now, generate the JSON array of selected group data fields.
"""
    print(f"--- Calling LLM to select group datafields for placeholder (正在调用LLM选择分组数据字段): {placeholder} ---", flush=True)
    reply = await call_llm(prompt, llm_client)
    valid_reply = isinstance(reply, list) and all(isinstance(item, str) for item in reply)
    if valid_reply:
        return [{"name": field} for field in reply[:top_n]]
    print(f"Warning: LLM did not return a valid list of strings for group datafields (警告:LLM未返回有效的分组数据字段列表). Got: {reply}", flush=True)
    # Fallback to default if LLM fails
    return [{"name": field} for field in predefined_group_fields[:top_n]]
async def get_operator_candidates(template_expression: str, template_explanation: str, placeholder: str, llm_client: openai.AsyncOpenAI, top_n: int = 3) -> list[dict]:
    """Select candidate BRAIN operators for a placeholder.

    Fetches all REGULAR-scope operators, asks the LLM to choose the most
    relevant ones, and returns ``[{"name": ..., "description": ...}, ...]``.
    Falls back to the first ``top_n`` operators when the LLM reply is invalid.
    """
    operators_data = get_brain_operators(scope_filters=["REGULAR"])
    all_operators = operators_data.get('operators', [])
    if not all_operators:
        print("No REGULAR scope operators found. (未找到REGULAR范围的运算符)", flush=True)
        return []
    # One-line summary per operator so the LLM can choose among them.
    operator_names_and_descriptions = "\n".join(
        f"- {op['name']}: {op.get('description', 'No description available.')}" for op in all_operators
    )
    prompt = f"""
As a quantitative finance expert, you need to select the most relevant operators for an alpha template placeholder.
Based on the template's logic, its explanation, and the specific placeholder, select {top_n} operators from the provided list that are most suitable.

**Alpha Template:**
`{template_expression}`

**Template Explanation:**
`{template_explanation}`

**Placeholder to Fill:**
`{placeholder}`

**Available REGULAR Scope Operators:**
{operator_names_and_descriptions}

**Your Task:**
Provide a list of selected operator names.
Return the output as a single, valid JSON array of strings.

**Example Output Format:**
["ts_mean", "ts_rank", "ts_decay"]

Now, generate the JSON array of selected operators.
"""
    print(f"--- Calling LLM to select operator candidates for placeholder (正在调用LLM选择运算符候选): {placeholder} ---", flush=True)
    reply = await call_llm(prompt, llm_client)
    if isinstance(reply, list) and all(isinstance(item, str) for item in reply):
        # Map each selected name back to its full operator record; first
        # occurrence wins, unknown names are silently skipped.
        by_name = {}
        for op in all_operators:
            by_name.setdefault(op['name'], op)
        chosen = [
            {"name": by_name[sel]['name'], "description": by_name[sel].get('description', '')}
            for sel in reply
            if sel in by_name
        ]
        return chosen[:top_n]
    print(f"Warning: LLM did not return a valid list of strings for operator candidates (警告:LLM未返回有效的运算符候选列表). Got: {reply}", flush=True)
    # Fallback to a default set if LLM fails
    return [{"name": op['name'], "description": op.get('description', '')} for op in all_operators[:top_n]]
""" param_description = "an integer value, typically a window length or count (e.g., `d` in `ts_mean(x, d)`)" if param_type == "integer_parameter" else \ "a floating-point number, typically a threshold or factor" prompt = f""" As a quantitative finance expert, you need to suggest sensible numerical candidates for a placeholder parameter. Based on the alpha template's logic, its explanation, and the placeholder's type and context, propose 3-5 diverse numerical candidates. **Alpha Template:** `{template_expression}` **Template Explanation:** `{template_explanation}` **Placeholder to Fill:** `{placeholder}` **Parameter Type:** This placeholder represents {param_description}. **Your Task:** Provide a list of numerical candidates that are appropriate for this parameter. Return the output as a single, valid JSON array of numbers. **Example Output (for integer_parameter):** [10, 20, 60, 120, 252] **Example Output (for float_parameter):** [0.01, 0.05, 0.1, 0.2, 0.5] Now, generate the JSON array of numerical candidates. """ print(f"--- Calling LLM to suggest candidates for {param_type} placeholder (正在调用LLM建议参数候选): {placeholder} ---", flush=True) response = await call_llm(prompt, llm_client) if isinstance(response, list) and all(isinstance(item, (int, float)) for item in response): return [{"value": val} for val in response] print(f"Warning: LLM did not return a valid list of numbers for {param_type} candidates (警告:LLM未返回有效的数字候选列表). Got: {response}", flush=True) # Fallback to default if LLM fails if param_type == "integer_parameter": return [{"value": x} for x in [10, 20, 60, 120, 252]] elif param_type == "float_parameter": return [{"value": x} for x in [0.01, 0.05, 0.1, 0.2, 0.5]] return [] async def judge_placeholder_type(placeholder: str, template_expression: str, template_explanation: str, operator_summary: str, llm_client: openai.AsyncOpenAI) -> str: """ Uses an LLM to judge the type of placeholder (e.g., "data_field", "integer_parameter", "group_operator"). 
""" prompt = f""" As a world-class quantitative finance expert, your task is to classify the type of a placeholder within an alpha expression. You will be provided with the alpha template, its explanation, the specific placeholder, and a comprehensive summary of available BRAIN operators and data field characteristics. **Alpha Template:** `{template_expression}` **Template Explanation:** `{template_explanation}` **Placeholder to Classify:** `{placeholder}` **Available BRAIN Operators and Data Field Characteristics:** {operator_summary} **Your Task:** Classify the `{placeholder}` based on the provided context. The classification should be one of the following types: - "data_field": If the placeholder clearly represents a financial data series (e.g., price, volume, fundamental ratio). - "group_data_field": If the placeholder represents a categorical field used for grouping or neutralization (e.g., `industry` in `group_zscore(x, industry)`). - "operator": If the placeholder represents a BRAIN operator that performs a calculation or transformation. - "vector_operator": If the placeholder represents a vector operator (e.g., vec_avg, vec_sum). - "integer_parameter": If the placeholder represents an integer value, typically a window length or count (e.g., `d` in `ts_mean(x, d)`). - "float_parameter": If the placeholder represents a floating-point number, typically a threshold or factor. - "string_parameter": If the placeholder represents a string value, like a group name (e.g., `industry` in `group_zscore(x, industry)`). - "unknown": If the type cannot be determined from the context. Return the classification as a single JSON object with a key "placeholder_type" and its corresponding value. Do not include any other text or formatting outside of the JSON object. **Example Output Format:** {{"placeholder_type": "data_field"}} {{"placeholder_type": "integer_parameter"}} Now, classify the placeholder. 
""" print(f"--- Calling LLM to judge type for placeholder (正在调用LLM判断占位符类型): {placeholder} ---", flush=True) response = await call_llm(prompt, llm_client) return response.get("placeholder_type", "unknown") async def populate_template(s: SingleSession, alpha_details: dict, template_expression: str, template_explanation: str, operator_summary: str, llm_client: openai.AsyncOpenAI, top_n_datafield: int = 50, user_region: Optional[str] = None, user_universe: Optional[str] = None, user_delay: Optional[int] = None, user_category: Optional[Union[str, list]] = None, user_data_type: str = "MATRIX") -> dict: """ Populates placeholders in an alpha template with candidate data fields, operators, or parameters. """ placeholders = extract_placeholders(template_expression) if not placeholders: print("No placeholders found in the template. (模板中未找到占位符)", flush=True) return {} """ Populates placeholders in an alpha template with candidate data fields, operators, or parameters. """ placeholders = extract_placeholders(template_expression) print(f"Found placeholders in template (在模板中找到占位符): {placeholders}", flush=True) populated_placeholders = {} for ph in placeholders: # Use LLM to judge placeholder type ph_type = await judge_placeholder_type(ph, template_expression, template_explanation, operator_summary, llm_client) print(f"'{ph}' judged as type (判断类型为): {ph_type}", flush=True) if ph_type == "data_field": candidates = await get_datafield_candidates(s, alpha_details, template_expression, template_explanation, ph, llm_client, top_n=top_n_datafield, user_region=user_region, user_universe=user_universe, user_delay=user_delay, user_category=user_category, user_data_type=user_data_type) populated_placeholders[ph] = {"type": "data_field", "candidates": candidates} elif ph_type == "group_data_field": candidates = await get_group_datafield_candidates(template_expression, template_explanation, ph, llm_client) populated_placeholders[ph] = {"type": "group_data_field", "candidates": candidates} 
elif ph_type in ["operator", "group_operator", "ts_operator","vector_operator"]: candidates = await get_operator_candidates(template_expression, template_explanation, ph, llm_client) populated_placeholders[ph] = {"type": ph_type, "candidates": candidates} elif ph_type in ["integer_parameter", "float_parameter"]: candidates = await get_parameter_candidates(ph_type, template_expression, template_explanation, ph, llm_client) populated_placeholders[ph] = {"type": ph_type, "candidates": candidates} elif ph_type == "string_parameter": # Add logic for string_parameter if needed, for now it returns empty populated_placeholders[ph] = {"type": "string_parameter", "candidates": []} else: print(f"Could not determine type for placeholder (无法确定占位符类型): {ph} (LLM classified as {ph_type})", flush=True) populated_placeholders[ph] = {"type": "unknown", "candidates": []} return populated_placeholders def get_datafield_prefix(datafield_name: str) -> str: """Extracts the prefix from a datafield name (e.g., 'anl44_...' -> 'anl44').""" if '_' in datafield_name: return datafield_name.split('_')[0] return datafield_name async def generate_new_alphas(alpha_description, brain_session, template_summary: Optional[str] = None, top_n_datafield: int = 50, user_region: Optional[str] = None, user_universe: Optional[str] = None, user_delay: Optional[int] = None, user_category: Optional[Union[str, list]] = None, user_data_type: str = "MATRIX"): """ Main function to generate new alpha templates based on a seed alpha. Args: alpha_description: The alpha description JSON string. brain_session: The BRAIN session object. template_summary: Optional template summary string. If None, will load from built-in. top_n_datafield: Number of data field candidates to retrieve (default: 50). user_data_type: Data type for datafield search (MATRIX or VECTOR). 
""" # 声明使用全局变量 global LLM_model_name, LLM_API_KEY, llm_base_url # Load template summary if not provided if template_summary is None: template_summary = load_template_summary() # --- Load Operator Summary --- operator_summary = get_brain_operators(scope_filters=["REGULAR"]) try: llm_api_key = get_token_from_auth_server() llm_base_url_value = llm_base_url # 使用全局变量 llm_client = openai.AsyncOpenAI(base_url=llm_base_url_value, api_key=llm_api_key) print("✓ LLM Gateway 认证成功", flush=True) except Exception as e: print(f"❌ LLM Gateway 认证失败: {e}", flush=True) sys.exit(1) details = json.loads(alpha_description) if not details: print(f"Failed to retrieve details for Alpha (获取Alpha详情失败)", flush=True) sys.exit(1) print("Alpha Details Retrieved (已获取Alpha详情):", flush=True) print(json.dumps(details, indent=4), flush=True) # --- Step 4: Propose New Alpha Templates --- print(f"\n{'='*60}", flush=True) print("[Step 2/5] 正在生成 Alpha 模板提议...", flush=True) print(f"{'='*60}", flush=True) proposed_templates = await propose_alpha_templates(details, template_summary, llm_client, user_data_type=user_data_type) if not proposed_templates: print("Failed to generate proposed alpha templates. 
(生成提议模板失败)", flush=True) sys.exit(1) print("\n--- Proposed Alpha Templates (JSON) (建议的Alpha模板,多样性会受到模型和模板总结文档的影响) ---", flush=True) print(json.dumps(proposed_templates, indent=4), flush=True) # --- Validation: Drop templates with suspicious literal identifiers --- try: operators_meta = get_brain_operators().get('operators', []) proposed_templates = _filter_valid_templates( proposed_templates, operators_meta, brain_session, details.get('settings', {}), parse_alpha_code, ) except Exception as e: print(f"⚠ 模板校验步骤出现异常,跳过校验: {e}", flush=True) if not proposed_templates: print("❌ 所有模板在校验后被丢弃,无法继续。", flush=True) sys.exit(1) # --- Step 5: Process all proposed templates and gather candidates --- # --- Step 6: Prepare for Output --- print(f"\n{'='*60}", flush=True) print("[Step 3/5] 正在处理模板并收集候选数据字段...", flush=True) print(f"{'='*60}", flush=True) # Ensure the output directory exists next to this script output_dir = Path(__file__).parent / "output" try: output_dir.mkdir(parents=True, exist_ok=True) print(f"✓ 输出目录已准备: {output_dir}", flush=True) except Exception as e: print(f"Warning: could not create directory {output_dir}: {e}", flush=True) output_filepath = output_dir / f"Alpha_candidates.json" final_output = {} # --- Step 5: Process all proposed templates and gather candidates --- total_templates = len(proposed_templates) for idx, (template_expr, template_expl) in enumerate(proposed_templates.items(), 1): print(f"\n[模板 {idx}/{total_templates}] 正在处理: '{template_expr[:60]}...'", flush=True) try: populated_info = await populate_template(brain_session, details, template_expr, template_expl, operator_summary, llm_client, top_n_datafield=top_n_datafield, user_region=user_region, user_universe=user_universe, user_delay=user_delay, user_category=user_category, user_data_type=user_data_type) # Skip templates where any data_field placeholder has zero candidates if _should_skip_due_to_empty_candidates(populated_info): print("⚠ 该模板存在数据字段候选为空的占位符,跳过此模板。", flush=True) continue 
final_output[template_expr] = { "template_explanation": template_expl, "seed_alpha_settings": details.get('settings', {}), "placeholder_candidates": populated_info } # --- Incremental Saving --- try: with output_filepath.open('w', encoding='utf-8') as f: json.dump(final_output, f, indent=4) print(f"✓ Progress saved to {output_filepath.name}", flush=True) except IOError as e: print(f"⚠️ Warning: Failed to save progress: {e}", flush=True) except Exception as e: print(f"❌ Error processing template '{template_expr}': {e}", flush=True) print("Skipping this template and continuing...", flush=True) continue print(f"\n{'='*60}", flush=True) print("[Step 4/5] 正在生成 Alpha 表达式组合...", flush=True) print(f"{'='*60}", flush=True) print(f"✓ 已处理 {len(final_output)} 个有效模板", flush=True) print("\n--- Final Consolidated Output (最终合并输出) ---", flush=True) print(json.dumps(final_output, indent=4), flush=True) generated_expressions = set() for template_expression, template_data in final_output.items(): placeholder_candidates = template_data["placeholder_candidates"] seed_alpha_settings = template_data["seed_alpha_settings"] # Prepare a dictionary to hold lists of candidates for each placeholder candidates_for_placeholders = {} for placeholder, details in placeholder_candidates.items(): # Extract only the 'value' or 'name' from the candidates list if details["type"] == "data_field": candidates_for_placeholders[placeholder] = [c["id"] for c in details["candidates"]] elif details["type"] in ["integer_parameter", "float_parameter"]: candidates_for_placeholders[placeholder] = [str(c["value"]) for c in details["candidates"]] elif details["type"] == "group_data_field": candidates_for_placeholders[placeholder] = [c["name"] for c in details["candidates"]] elif details["type"] == "operator": candidates_for_placeholders[placeholder] = [c["name"] for c in details["candidates"]] else: candidates_for_placeholders[placeholder] = [] # --- Step 3: Implement logic to generate all alpha expression 
combinations from the candidates --- # Generate all possible combinations of placeholder values placeholder_names = list(candidates_for_placeholders.keys()) all_combinations_values = list(itertools.product(*candidates_for_placeholders.values())) for combination_values in all_combinations_values: # --- ATOM Mode --- datafield_values_in_combo = [] placeholder_types = {ph: details["type"] for ph, details in placeholder_candidates.items()} for i, placeholder_name in enumerate(placeholder_names): if placeholder_types.get(placeholder_name) == 'data_field': datafield_values_in_combo.append(combination_values[i]) if len(datafield_values_in_combo) > 1: first_prefix = get_datafield_prefix(datafield_values_in_combo[0]) if not all(get_datafield_prefix(df) == first_prefix for df in datafield_values_in_combo): continue # Skip this combination as prefixes do not match current_expression = template_expression for i, placeholder_name in enumerate(placeholder_names): current_expression = current_expression.replace(placeholder_name, combination_values[i]) # Check for duplicates before adding if current_expression not in generated_expressions: generated_expressions.add(current_expression) # dump all unique generated expressions to a file, a list of strings in json file print(f"\n{'='*60}", flush=True) print("[Step 5/5] 正在验证生成的表达式...", flush=True) print(f"{'='*60}", flush=True) print(f"✓ 生成的唯一 Alpha 表达式总数: {len(generated_expressions)}", flush=True) # output_filepath = output_dir / f"Alpha_generated_expressions.json" # try: # with output_filepath.open('w', encoding='utf-8') as f: # json.dump(list(generated_expressions), f, indent=4) # print(f"\nGenerated expressions successfully written to {output_filepath} (生成的表达式已成功写入)") # except IOError as e: # print(f"Error writing generated expressions to file {output_filepath} (写入生成的表达式出错): {e}") validator = val.ExpressionValidator() print("请注意,该文件仅用于验证表达式的格式正确性,\n不保证表达式在实际使用中的逻辑正确性或可执行性。\n", flush=True) 
print("不在内置函数列表中的operator将无法检查,如有需要,请使用AI按需修改本源代码添加", flush=True) expressions_data = list(generated_expressions) # 提取表达式列表 # 假设JSON文件结构为 {"expressions": ["expr1", "expr2", ...]} 或直接是 ["expr1", "expr2", ...] if isinstance(expressions_data, dict) and "expressions" in expressions_data: expressions = expressions_data["expressions"] elif isinstance(expressions_data, list): expressions = expressions_data else: print("错误: JSON文件格式不正确,需要包含表达式列表", flush=True) return # 验证表达式 valid_expressions = [] invalid_expressions = [] print(f"开始验证 {len(expressions)} 个表达式...", flush=True) for i, expr in enumerate(expressions, 1): if i % 10 == 0: print(f"已验证 {i}/{len(expressions)} 个表达式", flush=True) result = validator.check_expression(expr) if result["valid"]: valid_expressions.append(expr) else: invalid_expressions.append({"expression": expr, "errors": result["errors"]}) # 生成输出文件路径 name = "Alpha_generated_expressions" valid_output_path = os.path.join(output_dir, f"{name}_success.json") invalid_output_path = os.path.join(output_dir, f"{name}_error.json") # 保存结果到JSON文件 print(f"\n验证完成!", flush=True) print(f"有效表达式: {len(valid_expressions)}", flush=True) print(f"无效表达式: {len(invalid_expressions)}", flush=True) # 保存有效表达式 try: with open(valid_output_path, 'w', encoding='utf-8') as f: json.dump(valid_expressions, f, ensure_ascii=False, indent=2) print(f"有效表达式已保存到: {valid_output_path}", flush=True) except Exception as e: print(f"错误: 保存有效表达式失败 - {e}", flush=True) # 保存无效表达式 try: with open(invalid_output_path, 'w', encoding='utf-8') as f: json.dump(invalid_expressions, f, ensure_ascii=False, indent=2) print(f"无效表达式已保存到: {invalid_output_path},文件包含错误详情", flush=True) print("查看该文件,你将获得修改模板的灵感,你可以定位到错误的模板并在APP里修改", flush=True) except Exception as e: print(f"错误: 保存无效表达式失败 - {e}", flush=True) print("请注意,该文件仅用于验证表达式的格式正确性,\n不保证表达式在实际使用中的逻辑正确性或可执行性。\n", flush=True) print("不在内置函数列表中的operator将无法检查,如有需要,请使用AI按需修改validator源代码添加", flush=True) print("不同模型效果不同,默认的kimi模型可能会产生Alpha语法错误,请检查生成的模板文件进行甄别", flush=True) 
print("下一步,请下载已完成的模板,放入APP首页进行解析和语法检查,强烈建议生成表达式后手动尝试回测", flush=True) async def main(): """ Main execution function. """ # Check for command line argument for config file if len(sys.argv) > 1: config_path = sys.argv[1] if os.path.exists(config_path): try: with open(config_path, 'r', encoding='utf-8') as f: config = json.load(f) print(f"✓ 已从命令行参数加载配置: {config_path}", flush=True) # Ensure all required fields are present or set defaults if 'top_n_datafield' not in config: config['top_n_datafield'] = 50 if 'template_summary_path' not in config: config['template_summary_path'] = None except Exception as e: print(f"❌ 加载配置文件失败: {e}", flush=True) sys.exit(1) else: print(f"❌ 配置文件不存在: {config_path}", flush=True) sys.exit(1) else: # --- Step 0: 交互式输入收集配置信息 --- print("\n" + "="*60, flush=True) print("交互式配置输入模式", flush=True) print("="*60 + "\n", flush=True) config = interactive_input() # 设置全局变量 global LLM_model_name, LLM_API_KEY, llm_base_url, username, password LLM_model_name = config['LLM_model_name'] LLM_API_KEY = config['LLM_API_KEY'] llm_base_url = config['llm_base_url'] username = config['username'] password = config['password'] # --- Step 1: 加载模板总结 --- template_summary = load_template_summary(config.get('template_summary_path')) # --- Step 2: 启动 BRAIN 会话 --- print("--- 正在启动 BRAIN 会话... ---", flush=True) s = start_session() # --- Step 3: 认证 LLM Gateway --- llm_client = None print("--- 正在认证 LLM Gateway... ---", flush=True) try: llm_api_key = get_token_from_auth_server() llm_base_url_value = llm_base_url llm_client = openai.AsyncOpenAI(base_url=llm_base_url_value, api_key=llm_api_key) print("✓ LLM Gateway 认证成功", flush=True) except Exception as e: print(f"❌ LLM Gateway 认证失败: {e}", flush=True) sys.exit(1) # --- Step 4: 获取 Alpha 详情 --- alpha_id = config['alpha_id'] print(f"\n--- 正在获取 Alpha ID: {alpha_id} 的详情... 
---", flush=True) # --- Step 4.5: 交互式选择数据字段范围 --- if len(sys.argv) > 1: user_datafield_config = { 'user_region': config.get('user_region'), 'user_universe': config.get('user_universe'), 'user_delay': config.get('user_delay'), 'user_category': config.get('user_category'), 'user_data_type': config.get('user_data_type', 'MATRIX') } else: user_datafield_config = interactive_datafield_selection(s) details_str = await generate_alpha_description(alpha_id, brain_session=s) await generate_new_alphas( alpha_description=details_str, brain_session=s, template_summary=template_summary, top_n_datafield=config.get('top_n_datafield', 50), user_region=user_datafield_config.get('user_region'), user_universe=user_datafield_config.get('user_universe'), user_delay=user_datafield_config.get('user_delay'), user_category=user_datafield_config.get('user_category'), user_data_type=user_datafield_config.get('user_data_type', 'MATRIX') ) def interactive_datafield_selection(s: SingleSession) -> dict: """ Interactively ask the user for datafield search configuration (Region, Universe, Delay). """ print("\n" + "="*60, flush=True) print("【附加配置】数据字段搜索范围配置", flush=True) print("正在获取有效的 Region/Universe/Delay 组合...", flush=True) try: df = get_instrument_type_region_delay(s) except Exception as e: print(f"⚠ 获取配置选项失败: {e}", flush=True) print("将使用 Seed Alpha 的默认设置", flush=True) return {} # Filter for EQUITY only as per current logic df_equity = df[df['InstrumentType'] == 'EQUITY'] if df_equity.empty: print("未找到 EQUITY 类型的配置选项。", flush=True) return {} # 1. Select Region regions = df_equity['Region'].unique().tolist() print(f"\n可用地区 (Region): {regions}", flush=True) region_input = input(f"请输入地区 (直接回车使用 Seed Alpha 默认值): ").strip() selected_region = None if region_input: if region_input in regions: selected_region = region_input else: print(f"⚠ 输入无效,将使用默认值", flush=True) # 2. 
def interactive_datafield_selection(s: SingleSession) -> dict:
    """
    Interactively ask the user for datafield search configuration (Region, Universe, Delay).

    Every prompt may be skipped with Enter, in which case the corresponding
    key stays None and the seed alpha's own setting is used downstream.
    Returns a dict with keys user_region/user_universe/user_delay/
    user_category/user_data_type; empty dict when the option table cannot
    be fetched.
    """
    print("\n" + "="*60, flush=True)
    print("【附加配置】数据字段搜索范围配置", flush=True)
    print("正在获取有效的 Region/Universe/Delay 组合...", flush=True)
    try:
        df = get_instrument_type_region_delay(s)
    except Exception as e:
        print(f"⚠ 获取配置选项失败: {e}", flush=True)
        print("将使用 Seed Alpha 的默认设置", flush=True)
        return {}
    # Filter for EQUITY only as per current logic
    df_equity = df[df['InstrumentType'] == 'EQUITY']
    if df_equity.empty:
        print("未找到 EQUITY 类型的配置选项。", flush=True)
        return {}
    # 1. Select Region
    regions = df_equity['Region'].unique().tolist()
    print(f"\n可用地区 (Region): {regions}", flush=True)
    region_input = input(f"请输入地区 (直接回车使用 Seed Alpha 默认值): ").strip()
    selected_region = None
    if region_input:
        if region_input in regions:
            selected_region = region_input
        else:
            print(f"⚠ 输入无效,将使用默认值", flush=True)
    # 2. Select Delay
    # If region is selected, filter delays for that region
    if selected_region:
        delays = df_equity[df_equity['Region'] == selected_region]['Delay'].unique().tolist()
    else:
        delays = df_equity['Delay'].unique().tolist()
    print(f"\n可用延迟 (Delay): {delays}", flush=True)
    delay_input = input(f"请输入延迟 (直接回车使用 Seed Alpha 默认值): ").strip()
    selected_delay = None
    if delay_input:
        try:
            d_val = int(delay_input)
            if d_val in delays:
                selected_delay = d_val
            else:
                print(f"⚠ 输入不在列表中,将使用默认值", flush=True)
        except ValueError:
            print(f"⚠ 输入无效,将使用默认值", flush=True)
    # 3. Select Universe
    # If region and delay are selected, filter universes
    if selected_region and selected_delay is not None:
        subset = df_equity[(df_equity['Region'] == selected_region) & (df_equity['Delay'] == selected_delay)]
        if not subset.empty:
            # The 'Universe' cell holds the list of universes for this combo.
            universes = subset.iloc[0]['Universe']
        else:
            universes = []
    else:
        # Just show all unique universes if we can't filter precisely
        universes = set()
        for u_list in df_equity['Universe']:
            universes.update(u_list)
        universes = list(universes)
    print(f"\n可用范围 (Universe): {universes}", flush=True)
    universe_input = input(f"请输入范围 (直接回车使用 Seed Alpha 默认值): ").strip()
    selected_universe = None
    if universe_input:
        if universe_input in universes:
            selected_universe = universe_input
        else:
            print(f"⚠ 输入无效,将使用默认值", flush=True)
    # 4. Select Category
    print("\n正在获取数据类别 (Data Categories)...", flush=True)
    categories = get_data_categories(s)
    selected_category = None
    if categories:
        print("\n可用类别 (Categories):", flush=True)
        for i, cat in enumerate(categories):
            print(f"{i+1}. {cat['name']} (ID: {cat['id']})", flush=True)
        cat_input = input(f"请输入类别编号或ID (多个用逗号分隔, 直接回车不筛选): ").strip()
        if cat_input:
            selected_categories = []
            inputs = [x.strip() for x in cat_input.split(',')]
            for inp in inputs:
                # Check if input is an index
                if inp.isdigit():
                    idx = int(inp) - 1
                    if 0 <= idx < len(categories):
                        selected_categories.append(categories[idx]['id'])
                        print(f"已选择类别: {categories[idx]['name']}", flush=True)
                else:
                    # Check if input is an ID
                    found = False
                    for cat in categories:
                        if cat['id'] == inp:
                            selected_categories.append(cat['id'])
                            print(f"已选择类别: {cat['name']}", flush=True)
                            found = True
                            break
                    if not found:
                        print(f"⚠ 输入无效: {inp}", flush=True)
            if selected_categories:
                selected_category = selected_categories
            else:
                print(f"⚠ 未选择有效类别,将不筛选类别", flush=True)
    else:
        print("⚠ 无法获取类别列表,跳过类别选择", flush=True)
    # 5. Select Data Type (VECTOR requires an explicit confirmation because it
    # easily breaks templates that do not use vector operators)
    print("\n可用数据类型 (Data Type): [MATRIX, VECTOR]", flush=True)
    data_type_input = input(f"请输入数据类型 (直接回车默认 MATRIX): ").strip().upper()
    selected_data_type = "MATRIX"
    if data_type_input == "VECTOR":
        print("⚠ 警告: 请确保您输入的原型Alpha中正确地使用了vector operator,否则极容易造成数据类型错误", flush=True)
        confirm = input("确认使用 VECTOR 吗? (y/n): ").strip().lower()
        if confirm == 'y':
            selected_data_type = "VECTOR"
        else:
            print("已取消 VECTOR 选择,使用默认值 MATRIX", flush=True)
    elif data_type_input and data_type_input != "MATRIX":
        print(f"⚠ 输入无效,将使用默认值 MATRIX", flush=True)
    return {
        'user_region': selected_region,
        'user_universe': selected_universe,
        'user_delay': selected_delay,
        'user_category': selected_category,
        'user_data_type': selected_data_type
    }


if __name__ == "__main__":
    # To allow asyncio to run in environments like Jupyter notebooks
    if sys.platform.startswith('win') and sys.version_info[:2] >= (3, 8):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(main())