You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
62 lines
2.1 KiB
62 lines
2.1 KiB
# -*- coding: utf-8 -*-
|
|
import re
|
|
import pandas as pd
|
|
import json
|
|
import ast # Python 的 ast.literal_eval 可以安全解析 Python 字典字符串
|
|
|
|
def test_code_001():
    """Print the ``code`` entry parsed from the ``regular`` column of alpha_list.csv.

    Reads the first 5 rows of ``alpha_list.csv`` from the current working
    directory. For each row whose ``regular`` cell is not missing, the cell is
    parsed with ``ast.literal_eval`` and the first 80 characters of its
    ``code`` entry are printed. Afterwards the first 200 characters of the
    first row's raw ``regular`` string are printed for inspection.

    Returns:
        None. All results are written to stdout.
    """
    df = pd.read_csv("alpha_list.csv", nrows=5)

    # Approach 1: ast.literal_eval parses Python literal syntax without
    # executing arbitrary code.
    for i, row in df.iterrows():
        regular_str = row['regular']
        if pd.isna(regular_str):
            continue
        try:
            regular_dict = ast.literal_eval(regular_str)
            code = regular_dict.get('code', '')
            print(f"Row {i}: {code[:80]}...")
        except Exception as e:
            # Deliberately broad: this is a best-effort per-row report, so any
            # failure (bad literal, non-dict value, non-string code) is shown
            # for that row and the loop moves on.
            print(f"Row {i}: 解析失败 - {e}")

    # Approach 2: look at the first 200 characters of the raw string.
    print("\n原始字符串示例:")
    if not df.empty:
        sample = df.iloc[0]['regular']
        # A missing cell is read as a float NaN, which cannot be sliced;
        # print such values unchanged instead of raising TypeError.
        print(sample[:200] if isinstance(sample, str) else sample)
|
|
|
|
def test_code_002():
    """Split a sample expression into operator names and field names and print them.

    Operators are the identifiers that are directly followed by ``(``.
    Fields are the remaining lowercase identifiers that are longer than two
    characters. The field list is printed in sorted order so that the output
    is the same on every run; the two sets are printed as sets, whose display
    order is not fixed.

    Returns:
        None. All results are written to stdout.
    """
    expression = "last_diff_value(ts_sum(subtract(implied_volatility_call_120, implied_volatility_put_90), 20), 5)"

    # Extract operators: an identifier immediately followed by "(".
    operators = set()
    for match in re.finditer(r'\b([a-z_][a-z0-9_]*)\s*\(', expression):
        op = match.group(1)
        operators.add(op)
        print(f"匹配到算子: {op}")

    print(f"\n所有算子: {operators}")

    # Extract field candidates: every word that starts with a lowercase
    # letter, so purely numeric tokens such as "20" are never captured.
    candidates = set(re.findall(r'\b([a-z][a-z0-9_]*)\b', expression))
    print(f"\n所有候选词: {candidates}")

    # Drop the operators and very short tokens; sort because iterating a set
    # of strings yields a different order from one interpreter run to the next.
    fields = sorted(c for c in candidates if c not in operators and len(c) > 2)
    print(f"\n字段: {fields}")
|
|
|
|
def test_code_003():
    """Report the dtype of the ``id`` column of alpha_list.csv and any repeated ids.

    Reads ``alpha_list.csv`` from the current working directory, prints the
    dtype of the ``id`` column and the number of rows whose ``id`` occurs more
    than once. For up to 5 of those ids it prints how many rows share the id
    and a preview (first 100 characters when it is a string) of each row's
    ``is`` value.

    Returns:
        None. All results are written to stdout.
    """
    df = pd.read_csv("alpha_list.csv")

    # Check the dtype of the id column.
    print(f"id 列类型: {df['id'].dtype}")

    # Check for duplicates. keep=False marks every row of a repeated id,
    # not only the second and later occurrences.
    duplicates = df[df.duplicated(subset=['id'], keep=False)]
    print(f"\n重复的 id 数量: {len(duplicates)}")

    if duplicates.empty:
        return

    print("\n重复的 id 示例:")
    for aid in duplicates['id'].unique()[:5]:
        # duplicated() groups missing ids together, but NaN never compares
        # equal with ==, so a missing id needs an isna() mask to find its rows.
        if pd.isna(aid):
            mask = duplicates['id'].isna()
        else:
            mask = duplicates['id'] == aid
        # Every row carrying a repeated id is already in `duplicates`, so
        # there is no need to scan the full frame again.
        rows = duplicates[mask]
        print(f" {aid}: 出现 {len(rows)} 次")
        # Print the fitness-related value of each row to see whether they match.
        for _, row in rows.iterrows():
            print(f" fitness: {row['is'][:100] if isinstance(row['is'], str) else row['is']}...")