# simple_alpha_fetcher.py
# (original path: alpha_tools/wqb-get-alphas/simple_alpha_fetcher.py, 264 lines, 9.2 KiB)
# Fetches the current user's alpha list from the WorldQuant Brain API.

import httpx
from httpx import BasicAuth, Timeout
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import logging
import time
import random
class SimpleAlphaFetcher:
    """Fetch the current user's alpha list from the WorldQuant Brain API.

    Credentials are pulled from a Nacos config service and a login is
    performed automatically on construction.  Fetched alphas are sorted by
    in-sample fitness and written to a CSV file under ``base_path``.
    """

    def __init__(self, base_path=None):
        """Initialize the fetcher and attempt to log in immediately.

        Args:
            base_path: Directory where output CSV files are saved;
                defaults to the current working directory.
        """
        # Set by login(); None means "not authenticated".
        self.client = None
        self.base_path = Path(base_path) if base_path else Path.cwd()
        self.logger = self._setup_logger()
        # Log in right away so the instance is usable immediately.
        self.login()

    def _setup_logger(self):
        """Create (or reuse) the module logger with a single stream handler.

        Returns:
            logging.Logger: INFO-level logger for this module.
        """
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        # Guard against duplicate handlers if the class is instantiated twice.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        return logger

    def login(self):
        """Log in to the WorldQuant Brain API.

        Returns:
            bool: True on success; False otherwise (self.client stays None).
        """
        try:
            # Fetch the account credentials from Nacos.
            # NOTE(review): credentials travel over plain HTTP — consider TLS.
            with httpx.Client(timeout=10.0) as temp_client:
                nacos_resp = temp_client.get(
                    'http://192.168.31.41:30848/nacos/v1/cs/configs?dataId=wq_account&group=quantify'
                )
            if nacos_resp.status_code != 200:
                self.logger.error('获取账号密码失败')
                return False
            config = nacos_resp.json()
            username = config.get('user_name')
            password = config.get('password')
            if not username or not password:
                self.logger.error('账号密码不完整')
                return False
            self.logger.info(f"正在登录账户: {username}")

            # Long-lived client with explicit per-phase timeouts.
            timeout = Timeout(connect=30.0, read=60.0, write=30.0, pool=30.0)
            self.client = httpx.Client(
                auth=BasicAuth(username, password),
                timeout=timeout
            )
            response = self.client.post('https://api.worldquantbrain.com/authentication')
            if response.status_code == 201:
                self.logger.info("登录成功!")
                return True
            # Any other status is a failed login: release the client.
            self.logger.error(f"登录失败: {response.status_code} - {response.text}")
            self.client.close()
            self.client = None
            return False
        except Exception as e:
            self.logger.error(f"登录异常: {e}")
            # BUGFIX: do not leave a half-initialized client behind if the
            # authentication POST raised — callers test `if fetcher.client`.
            if self.client is not None:
                self.client.close()
                self.client = None
            return False

    def fetch_alphas(
        self,
        max_pages: int = 100,
        limit: int = 100,
        delay: int = 1,
        region: str = "USA",
        universe: str = "TOP3000",
        hidden: str = "false",
        output_file_name: str = "alpha_list.csv",
        mode: str = "w",
        max_retries: int = 3,
    ) -> list:
        """Fetch the user's UNSUBMITTED / IS_FAIL alphas and save them as CSV.

        Args:
            max_pages: Maximum number of pages to fetch.
            limit: Page size.
            delay: ``settings.delay`` filter of the API query.
            region: Market region filter.
            universe: Universe filter.
            hidden: "true"/"false" — include hidden alphas in the search.
            output_file_name: Name of the CSV written under ``base_path``.
            mode: "w" to overwrite, "a" to append.
            max_retries: Retries per request before giving up.

        Returns:
            list: Raw alpha records as returned by the API (fetch order),
            or [] when nothing could be fetched.
        """
        if not self.client:
            self.logger.error("客户端未登录,无法执行获取")
            return []

        fetched_alphas = []
        offset = 0
        total_accessed = 0
        pages_fetched = 0

        # Probe the total count first (limit=1 keeps the probe cheap).
        # %1F encodes a 0x1F separator: status IN (UNSUBMITTED, IS_FAIL).
        count_url = (
            f"https://api.worldquantbrain.com/users/self/alphas?stage=IS&hidden={hidden}"
            f"&limit=1&settings.delay={delay}&settings.region={region}&status=UNSUBMITTED%1FIS_FAIL&settings.universe={universe}"
        )
        total_available = 0
        for attempt in range(max_retries):
            try:
                count_response = self.client.get(count_url)
                total_available = count_response.json()["count"]
                break
            except Exception as e:
                self.logger.warning(f"获取 Alpha 总数失败 (尝试 {attempt+1}/{max_retries}): {e}")
                if attempt < max_retries - 1:
                    # Jittered back-off before retrying.
                    time.sleep(random.uniform(3, 5))
                else:
                    self.logger.error("获取 Alpha 总数最终失败")
                    return []
        if total_available == 0:
            self.logger.warning("未找到任何 Alpha")
            return []

        # Ceiling division to get the real page count, capped at max_pages.
        actual_max_pages = min(max_pages, (total_available + limit - 1) // limit)
        self.logger.info(f"共找到 {total_available} 个 Alpha,计划爬取 {actual_max_pages} 页...")

        pbar = tqdm(total=actual_max_pages, desc="爬取 Alpha 页面", unit="")
        while pages_fetched < actual_max_pages:
            url = (
                f"https://api.worldquantbrain.com/users/self/alphas?stage=IS&limit={limit}"
                f"&offset={offset}&settings.delay={delay}&settings.region={region}&hidden={hidden}&status=UNSUBMITTED%1FIS_FAIL&settings.universe={universe}"
            )
            try:
                # Per-page retry with jittered back-off.
                response = None
                for attempt in range(max_retries):
                    try:
                        response = self.client.get(url)
                        break
                    except Exception as e:
                        self.logger.warning(f"请求失败 (尝试 {attempt+1}/{max_retries}): {e}")
                        if attempt < max_retries - 1:
                            time.sleep(random.uniform(3, 5))
                        else:
                            raise

                if response.status_code == 400:
                    self.logger.warning(f"遇到 API 限制 (offset={offset}),停止获取")
                    break
                # BUGFIX: don't blindly parse non-200 responses as JSON.
                if response.status_code != 200:
                    self.logger.error(f"请求失败: HTTP {response.status_code}")
                    break

                response_data = response.json()
                if not isinstance(response_data, dict) or "results" not in response_data:
                    self.logger.error(f"API 返回了意外的数据: {response_data}")
                    break
                alphas = response_data["results"]
                if not alphas:
                    break

                fetched_alphas.extend(alphas)
                total_accessed += len(alphas)
                pages_fetched += 1

                pbar.update(1)
                pbar.set_postfix({
                    "Region": region,
                    "已获取": total_accessed,
                    "本页": len(alphas),
                })

                # A short page means we reached the end of the result set.
                if len(alphas) < limit:
                    self.logger.info(f"本页数据不足 {limit} 条,已到达末尾")
                    break
                offset += limit
                # Throttle to avoid hammering the API.
                time.sleep(0.5)
            except Exception as e:
                self.logger.error(f"请求失败: {e}")
                break
        pbar.close()

        if not fetched_alphas:
            self.logger.warning("未获取到任何 Alpha!")
            return []

        # Sort by in-sample fitness (records without an "is" dict sort as 0).
        df = pd.DataFrame(fetched_alphas)
        df["temp_fitness"] = df.apply(
            lambda row: row["is"].get("fitness", 0) if isinstance(row.get("is"), dict) else 0,
            axis=1
        )
        df_sorted = df.sort_values(by="temp_fitness", ascending=False)
        df_sorted = df_sorted.drop("temp_fitness", axis=1)

        output_path = self.base_path / output_file_name
        if mode == "w":
            df_sorted.to_csv(output_path, index=False)
        elif mode == "a":
            # BUGFIX: when appending to a missing or empty file, still write
            # the header row so the CSV remains self-describing.
            need_header = not output_path.exists() or output_path.stat().st_size == 0
            df_sorted.to_csv(output_path, mode="a", index=False, header=need_header)
        self.logger.info(f"Alpha 列表已保存!共 {len(fetched_alphas)} 条记录!\n文件路径: {output_path}")
        return fetched_alphas

    def close(self):
        """Close the underlying HTTP client, if any (safe to call twice)."""
        if self.client:
            self.client.close()
            # BUGFIX: drop the reference so `if fetcher.client` checks and
            # repeated close() calls behave correctly after shutdown.
            self.client = None
# Usage example
if __name__ == "__main__":
    # Constructing the fetcher triggers an automatic login.
    wq_fetcher = SimpleAlphaFetcher()

    if wq_fetcher.client is not None:
        # Pull the alpha list and dump it to CSV.
        fetched = wq_fetcher.fetch_alphas(
            max_pages=100,                       # fetch at most 100 pages
            limit=100,                           # 100 records per page
            region="USA",                        # US market
            universe="TOP3000",                  # TOP3000 universe
            output_file_name="alpha_list.csv",   # output file name
        )
        total = len(fetched) if fetched else 0
        print(f"成功获取 {total} 个 Alpha")

    # Release the HTTP client (no-op if login failed).
    wq_fetcher.close()