|
|
#!/usr/bin/env python3
|
|
|
"""
|
|
|
计算特征增益值(修复版 v2)
|
|
|
"""
|
|
|
|
|
|
import sqlite3
|
|
|
import pandas as pd
|
|
|
from pathlib import Path
|
|
|
|
|
|
SQLITE_PATH = Path(__file__).parent / "alpha_analysis.db"
|
|
|
|
|
|
def main():
|
|
|
print("=" * 60)
|
|
|
print("开始计算特征增益值")
|
|
|
print("=" * 60)
|
|
|
|
|
|
conn = sqlite3.connect(str(SQLITE_PATH))
|
|
|
|
|
|
# 1. 获取所有 Alpha 的 fitness(使用 id 作为主键)
|
|
|
print("\n📊 获取 Alpha 数据...")
|
|
|
df_alpha = pd.read_sql_query("""
|
|
|
SELECT id, alpha_id, fitness
|
|
|
FROM alpha_success
|
|
|
WHERE fitness IS NOT NULL
|
|
|
""", conn)
|
|
|
|
|
|
print(f" 列名: {list(df_alpha.columns)}")
|
|
|
|
|
|
global_avg = df_alpha['fitness'].mean()
|
|
|
print(f" 全局平均 fitness: {global_avg:.4f}")
|
|
|
print(f" 样本数: {len(df_alpha):,}")
|
|
|
|
|
|
# 2. 获取所有特征
|
|
|
print("\n🔧 获取特征数据...")
|
|
|
df_features = pd.read_sql_query("""
|
|
|
SELECT alpha_id, feature_type, feature_name
|
|
|
FROM alpha_feature_long
|
|
|
""", conn)
|
|
|
print(f" 总特征数: {len(df_features):,}")
|
|
|
|
|
|
# 3. 合并 Alpha 的 fitness 到特征
|
|
|
print("\n📈 计算每个特征的平均分...")
|
|
|
|
|
|
# 建立 fitness 映射(使用 id)
|
|
|
fitness_map = dict(zip(df_alpha['id'], df_alpha['fitness']))
|
|
|
df_features['fitness'] = df_features['alpha_id'].map(fitness_map)
|
|
|
|
|
|
# 过滤掉没有 fitness 的特征
|
|
|
df_features = df_features[df_features['fitness'].notna()]
|
|
|
print(f" 有效特征数: {len(df_features):,}")
|
|
|
|
|
|
# 按特征分组统计
|
|
|
stats = df_features.groupby(['feature_type', 'feature_name']).agg(
|
|
|
avg_score=('fitness', 'mean'),
|
|
|
sample_count=('fitness', 'count'),
|
|
|
total_fitness=('fitness', 'sum')
|
|
|
).reset_index()
|
|
|
|
|
|
# 计算增益值
|
|
|
stats['gain'] = stats['avg_score'] - global_avg
|
|
|
|
|
|
# 按样本数过滤(至少出现 10 次)
|
|
|
stats_filtered = stats[stats['sample_count'] >= 10].copy()
|
|
|
print(f" 过滤后特征数(样本≥10): {len(stats_filtered):,}")
|
|
|
|
|
|
# 4. 按增益值排序
|
|
|
stats_positive = stats_filtered[stats_filtered['gain'] > 0.05].sort_values('gain', ascending=False)
|
|
|
stats_negative = stats_filtered[stats_filtered['gain'] < -0.05].sort_values('gain', ascending=True)
|
|
|
|
|
|
print(f"\n✅ 统计完成:")
|
|
|
print(f" 总特征类型数: {len(stats_filtered):,}")
|
|
|
print(f" 正向特征 (gain > 0.05): {len(stats_positive):,}")
|
|
|
print(f" 负向特征 (gain < -0.05): {len(stats_negative):,}")
|
|
|
|
|
|
# 5. 输出正向特征 Top 30
|
|
|
print("\n" + "=" * 60)
|
|
|
print("📈 正向特征 Top 30 (增益值 > 0.05)")
|
|
|
print("=" * 60)
|
|
|
for _, row in stats_positive.head(30).iterrows():
|
|
|
print(f" {row['feature_type']:15} {row['feature_name']:35} 增益: {row['gain']:+.4f} (样本: {row['sample_count']:,})")
|
|
|
|
|
|
# 6. 输出负向特征 Bottom 30
|
|
|
if len(stats_negative) > 0:
|
|
|
print("\n" + "=" * 60)
|
|
|
print("📉 负向特征 Bottom 30 (增益值 < -0.05)")
|
|
|
print("=" * 60)
|
|
|
for _, row in stats_negative.head(30).iterrows():
|
|
|
print(f" {row['feature_type']:15} {row['feature_name']:35} 增益: {row['gain']:+.4f} (样本: {row['sample_count']:,})")
|
|
|
|
|
|
# 7. 保存结果到 feature_statistics 表
|
|
|
print("\n💾 保存到 feature_statistics 表...")
|
|
|
|
|
|
# 清空旧数据
|
|
|
conn.execute("DELETE FROM feature_statistics")
|
|
|
|
|
|
# 插入新数据
|
|
|
for _, row in stats_filtered.iterrows():
|
|
|
positive_effect = 0
|
|
|
if row['gain'] > 0.05:
|
|
|
positive_effect = 1
|
|
|
elif row['gain'] < -0.05:
|
|
|
positive_effect = -1
|
|
|
|
|
|
conn.execute("""
|
|
|
INSERT INTO feature_statistics
|
|
|
(feature_type, feature_name, gain_value, avg_score, global_avg_score, sample_count, positive_effect)
|
|
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
|
""", (
|
|
|
row['feature_type'],
|
|
|
row['feature_name'],
|
|
|
row['gain'],
|
|
|
row['avg_score'],
|
|
|
global_avg,
|
|
|
row['sample_count'],
|
|
|
positive_effect
|
|
|
))
|
|
|
|
|
|
conn.commit()
|
|
|
|
|
|
# 8. 更新 generation_bias 表
|
|
|
print("\n💾 更新 generation_bias 表...")
|
|
|
conn.execute("DELETE FROM generation_bias")
|
|
|
|
|
|
# 只取增益绝对值 > 0.05 且样本数 >= 10 的特征
|
|
|
bias_features = stats_filtered[abs(stats_filtered['gain']) > 0.05]
|
|
|
|
|
|
for _, row in bias_features.iterrows():
|
|
|
bias_weight = 1.0
|
|
|
if row['gain'] > 0.05:
|
|
|
bias_weight = 1.0 + min(row['gain'] * 2, 2.0)
|
|
|
elif row['gain'] < -0.05:
|
|
|
bias_weight = max(0.1, 1.0 - abs(row['gain']) * 2)
|
|
|
|
|
|
conn.execute("""
|
|
|
INSERT INTO generation_bias
|
|
|
(feature_type, feature_name, bias_weight, gain_value, sample_count, is_active)
|
|
|
VALUES (?, ?, ?, ?, ?, 1)
|
|
|
""", (
|
|
|
row['feature_type'],
|
|
|
row['feature_name'],
|
|
|
bias_weight,
|
|
|
row['gain'],
|
|
|
row['sample_count']
|
|
|
))
|
|
|
|
|
|
conn.commit()
|
|
|
|
|
|
# 9. 统计信息
|
|
|
cursor = conn.cursor()
|
|
|
cursor.execute("SELECT COUNT(*) FROM feature_statistics")
|
|
|
stats_count = cursor.fetchone()[0]
|
|
|
cursor.execute("SELECT COUNT(*) FROM generation_bias")
|
|
|
bias_count = cursor.fetchone()[0]
|
|
|
|
|
|
print(f"\n✅ 完成!")
|
|
|
print(f" feature_statistics: {stats_count:,} 条")
|
|
|
print(f" generation_bias: {bias_count:,} 条")
|
|
|
|
|
|
# 10. 显示一些关键发现
|
|
|
print("\n" + "=" * 60)
|
|
|
print("💡 关键发现")
|
|
|
print("=" * 60)
|
|
|
|
|
|
# 最佳算子
|
|
|
best_ops = stats_positive[stats_positive['feature_type'] == 'operator'].head(5)
|
|
|
if len(best_ops) > 0:
|
|
|
print("\n🏆 最佳算子 (增益最高):")
|
|
|
for _, row in best_ops.iterrows():
|
|
|
print(f" {row['feature_name']}: 增益 {row['gain']:+.4f}")
|
|
|
|
|
|
# 最佳字段
|
|
|
best_fields = stats_positive[stats_positive['feature_type'] == 'field'].head(5)
|
|
|
if len(best_fields) > 0:
|
|
|
print("\n🏆 最佳字段 (增益最高):")
|
|
|
for _, row in best_fields.iterrows():
|
|
|
print(f" {row['feature_name']}: 增益 {row['gain']:+.4f}")
|
|
|
|
|
|
# 最差算子
|
|
|
worst_ops = stats_negative[stats_negative['feature_type'] == 'operator'].tail(5)
|
|
|
if len(worst_ops) > 0:
|
|
|
print("\n⚠️ 最差算子 (增益最低):")
|
|
|
for _, row in worst_ops.iterrows():
|
|
|
print(f" {row['feature_name']}: 增益 {row['gain']:+.4f}")
|
|
|
|
|
|
conn.close()
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
main() |