# -*- coding: utf-8 -*-
import ast  # ast.literal_eval safely parses strings holding Python dict literals
import json
import re

import pandas as pd


def test_code_001():
    """Preview the ``code`` entry found in the ``regular`` column of the CSV.

    Reads the first five rows of ``alpha_list.csv`` from the current working
    directory.  For every row whose ``regular`` cell is not missing, the cell
    is parsed with ``ast.literal_eval`` and the first 80 characters of its
    ``code`` entry are printed.  A row that cannot be processed is reported
    on stdout and does not stop the loop.  Finally the first 200 characters
    of the first row's raw ``regular`` cell are printed.

    Assumes each ``regular`` cell holds a Python dict literal with a string
    under ``'code'`` -- TODO confirm against the real data.

    Returns:
        None.  All results are written to stdout.
    """
    df = pd.read_csv("alpha_list.csv", nrows=5)

    # Approach 1: ast.literal_eval (preferred; it parses Python literals).
    for i, row in df.iterrows():
        regular_str = row['regular']
        if pd.isna(regular_str):
            continue
        try:
            regular_dict = ast.literal_eval(regular_str)
            code = regular_dict.get('code', '')
            print(f"Row {i}: {code[:80]}...")
        except Exception as e:
            # Deliberately broad: this is a per-row diagnostic, so any
            # failure (bad literal, non-dict value, unsliceable code) is
            # reported for that row and the remaining rows are still shown.
            print(f"Row {i}: 解析失败 - {e}")

    # Approach 2: look at the first 200 characters of the raw string.
    print("\n原始字符串示例:")
    if not df.empty:
        first_regular = df.iloc[0]['regular']
        # A missing cell is read as NaN (a float) and cannot be sliced, so
        # only slice real strings and print anything else unchanged.
        print(first_regular[:200] if isinstance(first_regular, str) else first_regular)


def test_code_002():
    """Show how operators and fields are pulled out of a sample expression.

    An operator is any lowercase identifier immediately followed by ``(``.
    A field is any other lowercase identifier longer than two characters.
    Every intermediate result is printed to stdout.

    Returns:
        None.
    """
    expression = "last_diff_value(ts_sum(subtract(implied_volatility_call_120, implied_volatility_put_90), 20), 5)"

    # Extract operators: identifiers directly followed by an opening paren.
    operators = set()
    for match in re.finditer(r'\b([a-z_][a-z0-9_]*)\s*\(', expression):
        op = match.group(1)
        operators.add(op)
        print(f"匹配到算子: {op}")

    print(f"\n所有算子: {operators}")

    # Extract candidate tokens: every identifier that starts with a letter.
    candidates = set(re.findall(r'\b([a-z][a-z0-9_]*)\b', expression))
    print(f"\n所有候选词: {candidates}")

    # Drop the operators and very short tokens.  Candidates always start with
    # a letter, so no separate digit check is needed.  Sorting keeps the
    # printed order stable between runs (set iteration order is not).
    fields = sorted(c for c in candidates if c not in operators and len(c) > 2)
    print(f"\n字段: {fields}")


def test_code_003():
    """Report duplicated ``id`` values in ``alpha_list.csv``.

    Prints the dtype of the ``id`` column and the number of rows whose ``id``
    occurs more than once.  For up to five duplicated ids it prints how many
    rows carry that id, followed by each such row's ``is`` value (truncated
    to 100 characters when it is a string) so the rows can be compared.

    Returns:
        None.  All results are written to stdout.
    """
    df = pd.read_csv("alpha_list.csv")

    # Check the dtype of the id column.
    print(f"id 列类型: {df['id'].dtype}")

    # Check for duplicates; keep=False marks every occurrence, not just the
    # second and later ones.
    duplicates = df[df.duplicated(subset=['id'], keep=False)]
    print(f"\n重复的 id 数量: {len(duplicates)}")

    if len(duplicates) > 0:
        print("\n重复的 id 示例:")
        for aid in duplicates['id'].unique()[:5]:
            rows = df[df['id'] == aid]
            print(f" {aid}: 出现 {len(rows)} 次")
            # Print the 'is' value of each of these rows to see whether the
            # duplicates are identical.
            for _, row in rows.iterrows():
                print(f" fitness: {row['is'][:100] if isinstance(row['is'], str) else row['is']}...")