You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 
alpha_tools/alpha-forge/scripts/05_test_extractor.py

62 lines
2.1 KiB

# -*- coding: utf-8 -*-
import re
import pandas as pd
import json
import ast # Python 的 ast.literal_eval 可以安全解析 Python 字典字符串
def test_code_001():
    """Print the ``code`` entry parsed from the first rows of ``alpha_list.csv``.

    Reads at most five rows, parses every non-null ``regular`` cell as a Python
    literal with ``ast.literal_eval`` and prints the first 80 characters of its
    ``code`` entry. Afterwards prints the first 200 characters of the first
    non-null raw ``regular`` value so the unparsed format can be inspected.

    Returns:
        None. Everything is written to stdout.

    Raises:
        FileNotFoundError: if ``alpha_list.csv`` is not in the working directory.
        KeyError: if the CSV has no ``regular`` column.
    """
    df = pd.read_csv("alpha_list.csv", nrows=5)
    regular_cells = df['regular']

    # Approach 1: ast.literal_eval (preferred; it parses Python literals).
    for i, regular_str in regular_cells.items():
        if pd.isna(regular_str):
            continue
        try:
            regular_dict = ast.literal_eval(regular_str)
            code = regular_dict.get('code', '')
            print(f"Row {i}: {code[:80]}...")
        except Exception as e:
            # Deliberately broad: this is a per-row diagnostic, so any failure
            # (malformed literal, non-dict value, non-string code) is reported
            # and the remaining rows are still processed.
            print(f"Row {i}: 解析失败 - {e}")

    # Approach 2: look at the first 200 characters of a raw string.
    print("\n原始字符串示例:")
    # Use the first non-null cell: indexing row 0 directly fails on an empty
    # file (IndexError) or when the first cell is NaN (float is not sliceable).
    usable_cells = regular_cells.dropna()
    if usable_cells.empty:
        print("(无可用的 regular 字符串)")
    else:
        print(str(usable_cells.iloc[0])[:200])
def test_code_002():
    """Show how operators and data fields are extracted from a sample expression.

    An operator is any lowercase identifier immediately followed by ``(``.
    A field is any other lowercase identifier longer than two characters.
    Each match and the resulting collections are printed to stdout.

    Returns:
        None.
    """
    expression = "last_diff_value(ts_sum(subtract(implied_volatility_call_120, implied_volatility_put_90), 20), 5)"

    # Extract operators: identifiers that are directly followed by "(".
    operators = set()
    for match in re.finditer(r'\b([a-z_][a-z0-9_]*)\s*\(', expression):
        op = match.group(1)
        operators.add(op)
        print(f"匹配到算子: {op}")
    print(f"\n所有算子: {operators}")

    # Extract field candidates: every identifier that starts with a letter.
    candidates = set(re.findall(r'\b([a-z][a-z0-9_]*)\b', expression))
    print(f"\n所有候选词: {candidates}")

    # Filter out the operators. Candidates always start with a letter, so no
    # separate "is it a number" check is needed. Sorting makes the result
    # independent of set iteration order, which varies between runs.
    fields = sorted(c for c in candidates if c not in operators and len(c) > 2)
    print(f"\n字段: {fields}")
def test_code_003():
    """Report the dtype of the ``id`` column in ``alpha_list.csv`` and show duplicates.

    Prints the dtype of ``id``, the number of rows whose ``id`` occurs more
    than once, and, for up to five such ids, how many rows share the id
    together with each row's ``is`` value (truncated to 100 characters when it
    is a string).

    Returns:
        None. Everything is written to stdout.

    Raises:
        FileNotFoundError: if ``alpha_list.csv`` is not in the working directory.
        KeyError: if the ``id`` column is missing, or if the ``is`` column is
            missing while duplicates exist.
    """
    df = pd.read_csv("alpha_list.csv")

    # Check the dtype of the id column.
    print(f"id 列类型: {df['id'].dtype}")

    # Check for duplicates. keep=False flags every occurrence, so the length
    # below counts rows that share an id, not distinct ids.
    duplicates = df[df.duplicated(subset=['id'], keep=False)]
    print(f"\n重复的 id 数量: {len(duplicates)}")
    if duplicates.empty:
        return

    print("\n重复的 id 示例:")
    duplicate_ids = duplicates['id']
    for aid in duplicate_ids.unique()[:5]:
        # duplicated() treats missing ids as equal to each other, but
        # `== NaN` never matches, so missing ids need an isna() mask.
        mask = duplicate_ids.isna() if pd.isna(aid) else duplicate_ids == aid
        rows = duplicates[mask]
        print(f" {aid}: 出现 {len(rows)}")
        # Print the "is" value of each of these rows to see whether they match.
        for is_value in rows['is']:
            shown = is_value[:100] if isinstance(is_value, str) else is_value
            print(f" fitness: {shown}...")