alpha_tools/alpha-forge/scripts/04_calc_gain.py

#!/usr/bin/env python3
"""
计算特征增益值（修复版 v2）
"""

import sqlite3
import pandas as pd
from pathlib import Path

SQLITE_PATH = Path(__file__).parent / "alpha_analysis.db"

def main():
    print("=" * 60)
    print("开始计算特征增益值")
    print("=" * 60)

    conn = sqlite3.connect(str(SQLITE_PATH))

    # 1. 获取所有 Alpha 的 fitness（使用 id 作为主键）
    print("\n📊 获取 Alpha 数据...")
    df_alpha = pd.read_sql_query("""
        SELECT id, alpha_id, fitness
        FROM alpha_success
        WHERE fitness IS NOT NULL
    """, conn)

    print(f"   列名: {list(df_alpha.columns)}")

    global_avg = df_alpha['fitness'].mean()
    print(f"   全局平均 fitness: {global_avg:.4f}")
    print(f"   样本数: {len(df_alpha):,}")

    # 2. 获取所有特征
    print("\n🔧 获取特征数据...")
    df_features = pd.read_sql_query("""
        SELECT alpha_id, feature_type, feature_name
        FROM alpha_feature_long
    """, conn)
    print(f"   总特征数: {len(df_features):,}")

    # 3. 合并 Alpha 的 fitness 到特征
    print("\n📈 计算每个特征的平均分...")

    # 建立 fitness 映射（使用 id）
    fitness_map = dict(zip(df_alpha['id'], df_alpha['fitness']))
    df_features['fitness'] = df_features['alpha_id'].map(fitness_map)

    # 过滤掉没有 fitness 的特征
    df_features = df_features[df_features['fitness'].notna()]
    print(f"   有效特征数: {len(df_features):,}")

    # 按特征分组统计
    stats = df_features.groupby(['feature_type', 'feature_name']).agg(
        avg_score=('fitness', 'mean'),
        sample_count=('fitness', 'count'),
        total_fitness=('fitness', 'sum')
    ).reset_index()

    # 计算增益值
    stats['gain'] = stats['avg_score'] - global_avg

    # 按样本数过滤（至少出现 10 次）
    stats_filtered = stats[stats['sample_count'] >= 10].copy()
    print(f"   过滤后特征数（样本≥10）: {len(stats_filtered):,}")

    # 4. 按增益值排序
    stats_positive = stats_filtered[stats_filtered['gain'] > 0.05].sort_values('gain', ascending=False)
    stats_negative = stats_filtered[stats_filtered['gain'] < -0.05].sort_values('gain', ascending=True)

    print(f"\n✅ 统计完成:")
    print(f"   总特征类型数: {len(stats_filtered):,}")
    print(f"   正向特征 (gain > 0.05): {len(stats_positive):,}")
    print(f"   负向特征 (gain < -0.05): {len(stats_negative):,}")

    # 5. 输出正向特征 Top 30
    print("\n" + "=" * 60)
    print("📈 正向特征 Top 30 (增益值 > 0.05)")
    print("=" * 60)
    for _, row in stats_positive.head(30).iterrows():
        print(f"   {row['feature_type']:15} {row['feature_name']:35} 增益: {row['gain']:+.4f}  (样本: {row['sample_count']:,})")

    # 6. 输出负向特征 Bottom 30
    if len(stats_negative) > 0:
        print("\n" + "=" * 60)
        print("📉 负向特征 Bottom 30 (增益值 < -0.05)")
        print("=" * 60)
        for _, row in stats_negative.head(30).iterrows():
            print(f"   {row['feature_type']:15} {row['feature_name']:35} 增益: {row['gain']:+.4f}  (样本: {row['sample_count']:,})")

    # 7. 保存结果到 feature_statistics 表
    print("\n💾 保存到 feature_statistics 表...")

    # 清空旧数据
    conn.execute("DELETE FROM feature_statistics")

    # 插入新数据
    for _, row in stats_filtered.iterrows():
        positive_effect = 0
        if row['gain'] > 0.05:
            positive_effect = 1
        elif row['gain'] < -0.05:
            positive_effect = -1

        conn.execute("""
            INSERT INTO feature_statistics
            (feature_type, feature_name, gain_value, avg_score, global_avg_score, sample_count, positive_effect)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """, (
            row['feature_type'],
            row['feature_name'],
            row['gain'],
            row['avg_score'],
            global_avg,
            row['sample_count'],
            positive_effect
        ))

    conn.commit()

    # 8. 更新 generation_bias 表
    print("\n💾 更新 generation_bias 表...")
    conn.execute("DELETE FROM generation_bias")

    # 只取增益绝对值 > 0.05 且样本数 >= 10 的特征
    bias_features = stats_filtered[abs(stats_filtered['gain']) > 0.05]

    for _, row in bias_features.iterrows():
        bias_weight = 1.0
        if row['gain'] > 0.05:
            bias_weight = 1.0 + min(row['gain'] * 2, 2.0)
        elif row['gain'] < -0.05:
            bias_weight = max(0.1, 1.0 - abs(row['gain']) * 2)

        conn.execute("""
            INSERT INTO generation_bias
            (feature_type, feature_name, bias_weight, gain_value, sample_count, is_active)
            VALUES (?, ?, ?, ?, ?, 1)
        """, (
            row['feature_type'],
            row['feature_name'],
            bias_weight,
            row['gain'],
            row['sample_count']
        ))

    conn.commit()

    # 9. 统计信息
    cursor = conn.cursor()
    cursor.execute("SELECT COUNT(*) FROM feature_statistics")
    stats_count = cursor.fetchone()[0]
    cursor.execute("SELECT COUNT(*) FROM generation_bias")
    bias_count = cursor.fetchone()[0]

    print(f"\n✅ 完成!")
    print(f"   feature_statistics: {stats_count:,} 条")
    print(f"   generation_bias: {bias_count:,} 条")

    # 10. 显示一些关键发现
    print("\n" + "=" * 60)
    print("💡 关键发现")
    print("=" * 60)

    # 最佳算子
    best_ops = stats_positive[stats_positive['feature_type'] == 'operator'].head(5)
    if len(best_ops) > 0:
        print("\n🏆 最佳算子 (增益最高):")
        for _, row in best_ops.iterrows():
            print(f"   {row['feature_name']}: 增益 {row['gain']:+.4f}")

    # 最佳字段
    best_fields = stats_positive[stats_positive['feature_type'] == 'field'].head(5)
    if len(best_fields) > 0:
        print("\n🏆 最佳字段 (增益最高):")
        for _, row in best_fields.iterrows():
            print(f"   {row['feature_name']}: 增益 {row['gain']:+.4f}")

    # 最差算子
    worst_ops = stats_negative[stats_negative['feature_type'] == 'operator'].tail(5)
    if len(worst_ops) > 0:
        print("\n⚠️  最差算子 (增益最低):")
        for _, row in worst_ops.iterrows():
            print(f"   {row['feature_name']}: 增益 {row['gain']:+.4f}")

    conn.close()


if __name__ == "__main__":
    main()