#!/usr/bin/env python3
"""
Alpha feature analysis database - long-table design.

Target scale: 95,003 expressions, 6,488 fields, 84 operators.

Running this file as a script creates the SQLite database next to the module
(asking before replacing an existing one) and prints the row count per table.
"""

import os
import re
import sqlite3
import sys
from pathlib import Path

DB_PATH = Path(__file__).parent / "alpha_analysis.db"

# Full list of the 84 operators, grouped by category.
OPERATORS = [
    # Arithmetic (16)
    'add', 'abs', 'log', 'subtract', 'signed_power', 'sign', 'reverse',
    'power', 'multiply', 'min', 'max', 'inverse', 'sqrt', 's_log_1p',
    'densify', 'divide',
    # Logical (11)
    'not', 'and', 'less', 'equal', 'or', 'not_equal', 'greater',
    'greater_equal', 'less_equal', 'is_nan', 'if_else',
    # Time Series (29)
    'ts_sum', 'ts_scale', 'ts_mean', 'ts_zscore', 'ts_std_dev', 'kth_element',
    'inst_tvr', 'ts_corr', 'ts_count_nans', 'ts_target_tvr_decay',
    'ts_median', 'ts_covariance', 'ts_decay_linear', 'ts_product',
    'ts_regression', 'ts_delta_limit', 'ts_step', 'ts_decay_exp_window',
    'ts_quantile', 'days_from_last_change', 'hump', 'last_diff_value',
    'ts_arg_max', 'ts_arg_min', 'ts_av_diff', 'ts_backfill', 'ts_rank',
    'ts_delay', 'ts_delta',
    # Cross Sectional (8)
    'winsorize', 'truncate', 'regression_neut', 'scale', 'rank', 'quantile',
    'normalize', 'zscore',
    # Vector (7)
    'vec_min', 'vec_count', 'vec_stddev', 'vec_range', 'vec_avg', 'vec_sum',
    'vec_max',
    # Transformational (4)
    'left_tail', 'trade_when', 'right_tail', 'bucket',
    # Group (9)
    'group_rank', 'group_cartesian_product', 'group_backfill', 'group_mean',
    'group_neutralize', 'group_normalize', 'group_median', 'group_scale',
    'group_zscore'
]

# Commonly used window parameters.
WINDOWS = [1, 2, 5, 10, 20, 30, 60, 90, 252]

# Neutralization modes.
NEUTRALIZATIONS = ['FAST', 'SUBINDUSTRY', 'INDUSTRY', 'NONE']

# Every table created by create_tables(), in creation order.
TABLE_NAMES = (
    'alpha_success',
    'alpha_feature_long',
    'feature_statistics',
    'alpha_failure',
    'operator_field_blacklist',
    'generation_bias',
)


def get_connection(db_path=None) -> sqlite3.Connection:
    """Open a SQLite connection that returns rows addressable by column name.

    Args:
        db_path: Database location to open. When omitted, ``DB_PATH`` is
            used. Any value accepted by ``sqlite3.connect`` after ``str()``
            works, including ``":memory:"``.

    Returns:
        A connection whose ``row_factory`` is ``sqlite3.Row`` and on which
        foreign-key enforcement has been switched on.
    """
    target = DB_PATH if db_path is None else db_path
    conn = sqlite3.connect(str(target))
    conn.row_factory = sqlite3.Row
    # SQLite ignores FOREIGN KEY clauses unless enforcement is enabled on each
    # connection; without this the ON DELETE CASCADE declared on
    # alpha_feature_long would never fire.
    conn.execute("PRAGMA foreign_keys = ON")
    return conn


def create_tables(conn: sqlite3.Connection) -> None:
    """Create all six tables and their indexes, then print a summary.

    Every statement uses ``IF NOT EXISTS``, so calling this on a database
    that already has the schema leaves existing objects untouched. The
    changes are committed before the summary is printed.

    Args:
        conn: Open SQLite connection on which the schema is created.
    """
    cursor = conn.cursor()

    # =========================================================
    # Table 1: alpha_success (successful samples - main table)
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS alpha_success (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            alpha_id VARCHAR(64) UNIQUE,
            expression TEXT NOT NULL,
            sharpe REAL,
            fitness REAL,
            returns REAL,
            drawdown REAL,
            turnover REAL,
            long_count INTEGER,
            short_count INTEGER,
            neutralization VARCHAR(32),
            universe VARCHAR(32),
            dataset VARCHAR(32),
            score REAL,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # Indexes
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_success_score ON alpha_success(score)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_success_sharpe ON alpha_success(sharpe)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_success_neutralization ON alpha_success(neutralization)")

    # =========================================================
    # Table 2: alpha_feature_long (feature long table - core)
    # One row per feature: operator, field, window, configuration.
    # Note: alpha_id here references alpha_success.id (the integer key),
    # not the VARCHAR alpha_success.alpha_id column.
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS alpha_feature_long (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            alpha_id INTEGER NOT NULL,
            feature_type VARCHAR(32) NOT NULL,  -- operator, field, window, neutralization
            feature_name VARCHAR(128) NOT NULL,
            feature_value INTEGER DEFAULT 1,  -- 1=存在,窗口值存具体数字
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (alpha_id) REFERENCES alpha_success(id) ON DELETE CASCADE
        )
    """)

    # Indexes (speed up the statistics queries)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_feature_type_name ON alpha_feature_long(feature_type, feature_name)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_feature_alpha_id ON alpha_feature_long(alpha_id)")

    # =========================================================
    # Table 3: feature_statistics (materialized statistics - feedback core)
    # Stores the gain value of each feature for use by the generator.
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS feature_statistics (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            feature_type VARCHAR(32) NOT NULL,
            feature_name VARCHAR(128) NOT NULL,
            gain_value REAL,  -- 增益值 = 特征平均分 - 全局平均分
            avg_score REAL,  -- 包含该特征的表达式平均分
            global_avg_score REAL,  -- 统计时的全局平均分
            sample_count INTEGER,  -- 包含该特征的表达式数量
            positive_effect INTEGER DEFAULT 0,  -- 1=正向特征(gain>0.05), -1=负向特征(gain<-0.05)
            is_active INTEGER DEFAULT 1,  -- 是否启用反馈
            calculated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(feature_type, feature_name)
        )
    """)

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_stats_positive ON feature_statistics(positive_effect)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_stats_gain ON feature_statistics(gain_value DESC)")

    # =========================================================
    # Table 4: alpha_failure (failed samples - incompatible combinations)
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS alpha_failure (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            operator VARCHAR(64) NOT NULL,
            field VARCHAR(64) NOT NULL,
            expression TEXT,
            error_type VARCHAR(32),
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_failure_combo ON alpha_failure(operator, field)")

    # =========================================================
    # Table 5: operator_field_blacklist (blacklist - built from the
    # incompatibility statistics)
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS operator_field_blacklist (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            operator VARCHAR(64) NOT NULL,
            field VARCHAR(64) NOT NULL,
            fail_count INTEGER DEFAULT 0,
            total_attempts INTEGER DEFAULT 0,
            fail_rate REAL DEFAULT 0.0,
            is_active INTEGER DEFAULT 1,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(operator, field)
        )
    """)

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_blacklist_active ON operator_field_blacklist(is_active)")

    # =========================================================
    # Table 6: generation_bias (generation bias config - fed back to the
    # generator)
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS generation_bias (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            feature_type VARCHAR(32) NOT NULL,
            feature_name VARCHAR(128) NOT NULL,
            bias_weight REAL DEFAULT 1.0,  -- >1 正向,<1 负向,1 中性
            gain_value REAL,
            sample_count INTEGER,
            is_active INTEGER DEFAULT 1,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(feature_type, feature_name)
        )
    """)

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_bias_weight ON generation_bias(bias_weight DESC)")

    conn.commit()

    print("✅ 所有表创建成功")
    print(f"📁 数据库路径: {DB_PATH}")
    print("\n📊 表结构:")
    print(" - alpha_success: 主表,存储表达式和回测结果")
    print(" - alpha_feature_long: 长表,每个特征一行")
    print(" - feature_statistics: 物化统计,特征增益值")
    print(" - alpha_failure: 失败样本记录")
    print(" - operator_field_blacklist: 不兼容黑名单")
    print(" - generation_bias: 生成偏向配置")


def show_table_info(conn: sqlite3.Connection) -> None:
    """Print the current row count of every table in ``TABLE_NAMES``.

    Args:
        conn: Open SQLite connection on which the tables already exist.
    """
    cursor = conn.cursor()

    print("\n📋 当前表记录数:")
    for table in TABLE_NAMES:
        # Identifiers cannot be bound as SQL parameters; interpolation is
        # safe here only because the names come from the fixed tuple above.
        cursor.execute(f"SELECT COUNT(*) FROM {table}")
        count = cursor.fetchone()[0]
        print(f" - {table}: {count:,}")


def main() -> None:
    """Create the database file, asking before replacing an existing one."""
    # Check whether the database already exists.
    if DB_PATH.exists():
        print(f"⚠️ 数据库已存在: {DB_PATH}")
        response = input("是否删除并重建?(y/N): ").strip().lower()
        if response != 'y':
            print("❌ 已取消操作")
            # sys.exit rather than the site-provided exit(), which is not
            # guaranteed to be defined in every interpreter setup.
            sys.exit(0)
        os.remove(DB_PATH)
        print("🗑️ 已删除旧数据库")

    # Create the database and its tables.
    conn = get_connection()
    try:
        create_tables(conn)
        show_table_info(conn)
    finally:
        conn.close()

    print("\n✅ 初始化完成!下一步:导入数据并提取特征")


if __name__ == "__main__":
    main()