#!/usr/bin/env python3
"""
Alpha feature-analysis database - long-table schema.

Intended scale: 95,003 expressions, 6,488 fields, 84 operators.
"""
|
|
|
|
|
|
import os
import re
import sqlite3
import sys
from pathlib import Path
|
|
|
|
|
|
# SQLite database file, created next to this script.
DB_PATH = Path(__file__).parent / "alpha_analysis.db"

# Complete list of the 84 operators, grouped by category.
# The per-category counts below match the entries actually listed.
OPERATORS = [
    # Arithmetic (16)
    'add', 'abs', 'log', 'subtract', 'signed_power', 'sign', 'reverse', 'power',
    'multiply', 'min', 'max', 'inverse', 'sqrt', 's_log_1p', 'densify', 'divide',
    # Logical (11)
    'not', 'and', 'less', 'equal', 'or', 'not_equal', 'greater', 'greater_equal',
    'less_equal', 'is_nan', 'if_else',
    # Time Series (29)
    'ts_sum', 'ts_scale', 'ts_mean', 'ts_zscore', 'ts_std_dev', 'kth_element',
    'inst_tvr', 'ts_corr', 'ts_count_nans', 'ts_target_tvr_decay', 'ts_median',
    'ts_covariance', 'ts_decay_linear', 'ts_product', 'ts_regression', 'ts_delta_limit',
    'ts_step', 'ts_decay_exp_window', 'ts_quantile', 'days_from_last_change', 'hump',
    'last_diff_value', 'ts_arg_max', 'ts_arg_min', 'ts_av_diff', 'ts_backfill',
    'ts_rank', 'ts_delay', 'ts_delta',
    # Cross Sectional (8)
    'winsorize', 'truncate', 'regression_neut', 'scale', 'rank', 'quantile',
    'normalize', 'zscore',
    # Vector (7)
    'vec_min', 'vec_count', 'vec_stddev', 'vec_range', 'vec_avg', 'vec_sum', 'vec_max',
    # Transformational (4)
    'left_tail', 'trade_when', 'right_tail', 'bucket',
    # Group (9)
    'group_rank', 'group_cartesian_product', 'group_backfill', 'group_mean',
    'group_neutralize', 'group_normalize', 'group_median', 'group_scale', 'group_zscore',
]

# Commonly used window parameters.
WINDOWS = [1, 2, 5, 10, 20, 30, 60, 90, 252]

# Neutralization modes.
NEUTRALIZATIONS = ['FAST', 'SUBINDUSTRY', 'INDUSTRY', 'NONE']
|
|
|
|
|
|
|
|
|
def get_connection():
    """Open a connection to the SQLite database at ``DB_PATH``.

    The connection is configured so that:

    * rows are returned as ``sqlite3.Row`` objects (access by column name
      or by index);
    * foreign-key enforcement is switched on. SQLite disables it by default
      for every new connection, so without this pragma the
      ``ON DELETE CASCADE`` declared on ``alpha_feature_long.alpha_id``
      would be silently ignored.

    Returns:
        sqlite3.Connection: An open connection; the caller must close it.
    """
    conn = sqlite3.connect(str(DB_PATH))
    conn.row_factory = sqlite3.Row
    # Must be issued per connection and outside a transaction to take effect.
    conn.execute("PRAGMA foreign_keys = ON")
    return conn
|
|
|
|
|
|
|
|
|
def create_tables(conn):
    """Create all tables and indexes of the analysis schema.

    Six tables are created: ``alpha_success``, ``alpha_feature_long``,
    ``feature_statistics``, ``alpha_failure``, ``operator_field_blacklist``
    and ``generation_bias``, each with its supporting indexes. Every
    statement uses ``IF NOT EXISTS``, so calling this on an already
    initialised database leaves existing tables untouched.

    The changes are committed and a short summary (including ``DB_PATH``)
    is printed to stdout.

    Args:
        conn: Open ``sqlite3`` connection to create the schema in.
    """
    cursor = conn.cursor()

    # =========================================================
    # Table 1: alpha_success (successful samples - main table)
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS alpha_success (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            alpha_id VARCHAR(64) UNIQUE,
            expression TEXT NOT NULL,
            sharpe REAL,
            fitness REAL,
            returns REAL,
            drawdown REAL,
            turnover REAL,
            long_count INTEGER,
            short_count INTEGER,
            neutralization VARCHAR(32),
            universe VARCHAR(32),
            dataset VARCHAR(32),
            score REAL,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    # Indexes
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_success_score ON alpha_success(score)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_success_sharpe ON alpha_success(sharpe)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_success_neutralization ON alpha_success(neutralization)")

    # =========================================================
    # Table 2: alpha_feature_long (feature long table - core)
    # One row per feature: operator, field, window, configuration.
    # Note: alpha_id here references alpha_success.id (the integer key),
    # not the textual alpha_success.alpha_id column.
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS alpha_feature_long (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            alpha_id INTEGER NOT NULL,
            feature_type VARCHAR(32) NOT NULL, -- operator, field, window, neutralization
            feature_name VARCHAR(128) NOT NULL,
            feature_value INTEGER DEFAULT 1, -- 1=存在,窗口值存具体数字
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (alpha_id) REFERENCES alpha_success(id) ON DELETE CASCADE
        )
    """)

    # Indexes (speed up the statistics queries)
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_feature_type_name ON alpha_feature_long(feature_type, feature_name)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_feature_alpha_id ON alpha_feature_long(alpha_id)")

    # =========================================================
    # Table 3: feature_statistics (materialised statistics - feedback core)
    # Stores the gain value of each feature for use by the generator.
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS feature_statistics (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            feature_type VARCHAR(32) NOT NULL,
            feature_name VARCHAR(128) NOT NULL,
            gain_value REAL, -- 增益值 = 特征平均分 - 全局平均分
            avg_score REAL, -- 包含该特征的表达式平均分
            global_avg_score REAL, -- 统计时的全局平均分
            sample_count INTEGER, -- 包含该特征的表达式数量
            positive_effect INTEGER DEFAULT 0, -- 1=正向特征(gain>0.05), -1=负向特征(gain<-0.05)
            is_active INTEGER DEFAULT 1, -- 是否启用反馈
            calculated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(feature_type, feature_name)
        )
    """)

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_stats_positive ON feature_statistics(positive_effect)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_stats_gain ON feature_statistics(gain_value DESC)")

    # =========================================================
    # Table 4: alpha_failure (failed samples - incompatible combinations)
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS alpha_failure (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            operator VARCHAR(64) NOT NULL,
            field VARCHAR(64) NOT NULL,
            expression TEXT,
            error_type VARCHAR(32),
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_failure_combo ON alpha_failure(operator, field)")

    # =========================================================
    # Table 5: operator_field_blacklist (blacklist - derived from
    # the incompatibility statistics)
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS operator_field_blacklist (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            operator VARCHAR(64) NOT NULL,
            field VARCHAR(64) NOT NULL,
            fail_count INTEGER DEFAULT 0,
            total_attempts INTEGER DEFAULT 0,
            fail_rate REAL DEFAULT 0.0,
            is_active INTEGER DEFAULT 1,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(operator, field)
        )
    """)

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_blacklist_active ON operator_field_blacklist(is_active)")

    # =========================================================
    # Table 6: generation_bias (generation bias configuration -
    # fed back to the generator)
    # =========================================================
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS generation_bias (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            feature_type VARCHAR(32) NOT NULL,
            feature_name VARCHAR(128) NOT NULL,
            bias_weight REAL DEFAULT 1.0, -- >1 正向,<1 负向,1 中性
            gain_value REAL,
            sample_count INTEGER,
            is_active INTEGER DEFAULT 1,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(feature_type, feature_name)
        )
    """)

    cursor.execute("CREATE INDEX IF NOT EXISTS idx_bias_weight ON generation_bias(bias_weight DESC)")

    conn.commit()

    # User-facing summary; only the path line needs interpolation.
    print("✅ 所有表创建成功")
    print(f"📁 数据库路径: {DB_PATH}")
    print("\n📊 表结构:")
    print(" - alpha_success: 主表,存储表达式和回测结果")
    print(" - alpha_feature_long: 长表,每个特征一行")
    print(" - feature_statistics: 物化统计,特征增益值")
    print(" - alpha_failure: 失败样本记录")
    print(" - operator_field_blacklist: 不兼容黑名单")
    print(" - generation_bias: 生成偏向配置")
|
|
|
|
|
|
|
|
|
def show_table_info(conn):
    """Print the current row count of each schema table to stdout.

    Args:
        conn: Open ``sqlite3`` connection whose database contains the six
            schema tables; a missing table makes the query raise.
    """
    cursor = conn.cursor()
    tables = ['alpha_success', 'alpha_feature_long', 'feature_statistics',
              'alpha_failure', 'operator_field_blacklist', 'generation_bias']

    print("\n📋 当前表记录数:")
    for table in tables:
        # Identifiers cannot be bound as SQL parameters; interpolation is
        # safe here because the names come only from the fixed list above.
        cursor.execute(f"SELECT COUNT(*) FROM {table}")
        count = cursor.fetchone()[0]
        print(f" - {table}: {count:,}")
|
|
|
|
|
|
|
|
|
def main():
    """Interactively initialise the database file and its schema.

    If the file at ``DB_PATH`` already exists, the user is asked whether to
    delete and rebuild it; any answer other than ``y`` cancels and exits
    with status 0. Otherwise a connection is opened, the tables are
    created, their row counts are printed, and the connection is closed.
    """
    # Check whether the database already exists.
    if DB_PATH.exists():
        print(f"⚠️ 数据库已存在: {DB_PATH}")
        response = input("是否删除并重建?(y/N): ").strip().lower()
        if response != 'y':
            print("❌ 已取消操作")
            # sys.exit rather than the site-provided exit(), which is meant
            # for the interactive shell and is not always available.
            sys.exit(0)
        DB_PATH.unlink()
        print("🗑️ 已删除旧数据库")

    # Create the database and its tables.
    conn = get_connection()
    try:
        create_tables(conn)
        show_table_info(conn)
    finally:
        conn.close()

    print("\n✅ 初始化完成!下一步:导入数据并提取特征")


if __name__ == "__main__":
    main()