You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
FieldDownloader/main.py

466 lines
16 KiB

# -*- coding: utf-8 -*-
import os
import random
import time
import sqlite3
import httpx
from httpx import BasicAuth
class DataSetDownloader:
def __init__(self):
self.base_api_url = 'https://api.worldquantbrain.com'
self.db_path = os.path.join(os.getcwd(), 'data_sets.db')
self.client = None
self.login_time = None
self.total_tasks = 0
self.completed_tasks = 0
self.current_task_index = 0
self.login()
def login(self):
"""登录并返回客户端实例"""
username, password = "jack0210_@hotmail.com", "!QAZ2wsx+0913"
client = httpx.Client(auth=BasicAuth(username, password))
try:
response = client.post(f'{self.base_api_url}/authentication')
print(f"登录状态: {response.status_code}")
if response.status_code in [200, 201]:
print("登录成功!")
self.client = client
self.login_time = time.time()
return client
else:
print(f"登录失败: {response.json()}")
exit(1)
except Exception as e:
print(f"登录过程中出现错误: {e}")
exit(1)
def check_and_renew_login(self):
"""检查登录状态,超过3.5小时则重新登录"""
if not self.login_time:
return
current_time = time.time()
elapsed_hours = (current_time - self.login_time) / 3600
if elapsed_hours > 3.5:
print(f"\n 登录已超过{elapsed_hours:.1f}小时,正在重新登录...")
self.login()
print("✅ 重新登录成功,继续执行任务")
def get_download_stats(self):
"""获取下载统计信息"""
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
# 获取总任务数
cursor.execute('SELECT COUNT(*) FROM category WHERE downloaded = 0')
remaining_tasks = cursor.fetchone()[0]
# 获取已完成任务数
cursor.execute('SELECT COUNT(*) FROM category WHERE downloaded = 1')
completed_tasks = cursor.fetchone()[0]
conn.close()
total_tasks = remaining_tasks + completed_tasks
progress_percentage = (completed_tasks / total_tasks * 100) if total_tasks > 0 else 0
return {
'total_tasks': total_tasks,
'remaining_tasks': remaining_tasks,
'completed_tasks': completed_tasks,
'progress_percentage': progress_percentage
}
def print_progress_bar(self, current, total, prefix='', suffix='', length=50, fill=''):
"""打印进度条"""
percent = f"{100 * (current / float(total)):.1f}"
filled_length = int(length * current // total)
bar = fill * filled_length + '-' * (length - filled_length)
progress_text = f'\r{prefix} |{bar}| {percent}% {suffix}'
if current == total:
print(progress_text)
else:
print(progress_text, end='', flush=True)
def test_download(self):
"""测试每个category分类是否能成功下载数据"""
if not self.client:
print("❌ 客户端未初始化")
exit(1)
print("\n" + "=" * 60)
print("开始测试所有category分类...")
print("=" * 60)
# 从数据库获取每个category分类的一条测试记录
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
# 使用DISTINCT获取每个category.name的第一条记录
cursor.execute('''
SELECT DISTINCT name, category_id, region, universe, instrumentType, delay
FROM category
WHERE downloaded = 0
ORDER BY name
''')
test_categories = cursor.fetchall()
conn.close()
print(f"找到 {len(test_categories)} 个不同的category分类进行测试")
success_count = 0
fail_count = 0
for idx, category in enumerate(test_categories):
# 检查登录状态
self.check_and_renew_login()
category_name, category_id, region, universe, instrumentType, delay = category
print(f"\n🔍 测试 [{idx + 1}/{len(test_categories)}]: {category_name}")
print(f" category_id: {category_id}")
print(f" region: {region}, universe: {universe}")
try:
# 测试下载(只下载第一页的1条数据)
result = self._test_single_category(
category_name, category_id, region, universe, instrumentType, delay
)
if result:
print(f" ✅ 测试成功")
success_count += 1
else:
print(f" ❌ 测试失败")
fail_count += 1
except Exception as e:
print(f" ❌ 测试异常: {e}")
fail_count += 1
# 打印测试进度
self.print_progress_bar(idx + 1, len(test_categories),
prefix='测试进度:',
suffix=f'完成 {success_count} 成功, {fail_count} 失败')
# 测试间隔
time.sleep(random.uniform(3, 5))
# 输出测试结果
print("\n" + "=" * 60)
print("测试结果汇总:")
print("=" * 60)
print(f"总测试分类数: {len(test_categories)}")
print(f"成功: {success_count}")
print(f"失败: {fail_count}")
if fail_count == 0:
print("\n🎉 所有category分类测试通过!")
else:
print(f"\n{fail_count} 个分类测试失败")
return fail_count == 0
def _test_single_category(self, category_name, category_id, region, universe, instrumentType, delay):
"""测试单个category是否能成功下载"""
endpoint = 'data-fields'
url = f"{self.base_api_url}/{endpoint}"
# 构建参数,只请求1条数据
params = {
'dataset.id': category_id,
'delay': delay,
'instrumentType': instrumentType,
'limit': 1,
'offset': 0,
'region': region,
'universe': universe
}
try:
response = self.client.get(url, params=params, timeout=30)
if response.status_code != 200:
print(f" HTTP状态码: {response.status_code}")
return False
data = response.json()
total_count = data.get('count', 0)
results = data.get('results', [])
print(f" 数据总数: {total_count}, 本页获取: {len(results)}")
if total_count > 0 and len(results) > 0:
# 打印第一条数据的部分信息用于验证
item = results[0]
print(f" 示例数据字段 - id: {item.get('id', 'N/A')}, name: {item.get('name', 'N/A')}")
return True
else:
print(f" 无数据返回")
return total_count == 0 # 如果count=0也算测试成功(只是没数据)
except httpx.TimeoutException:
print(" ⏰ 请求超时")
return False
except Exception as e:
print(f" ❌ 请求异常: {e}")
return False
def get_categories_to_download(self):
"""从数据库获取需要下载的category记录"""
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute('''
SELECT name, base_url, category_id, region, universe, instrumentType, delay
FROM category
WHERE downloaded = 0
ORDER BY rowid
''')
categories = cursor.fetchall()
conn.close()
# 更新统计信息
self.total_tasks = len(categories)
self.completed_tasks = 0
self.current_task_index = 0
print(f"找到 {self.total_tasks} 个需要下载的category")
# 打印总体进度
if self.total_tasks > 0:
stats = self.get_download_stats()
print(f"总体进度: {stats['completed_tasks']}/{stats['total_tasks']} ({stats['progress_percentage']:.1f}%)")
return categories
def _build_params(self, category_id, region, universe, instrumentType, delay, offset=0, limit=50):
"""构建data-fields请求参数"""
return {
'dataset.id': category_id,
'delay': delay,
'instrumentType': instrumentType,
'limit': limit,
'offset': offset,
'region': region,
'universe': universe
}
def _process_item(self, item):
"""处理单个data-field项 - name字段对应返回的id字段"""
return {
'name': item.get('id', ''), # name字段对应返回的id字段
'description': item.get('description', ''),
'dataset_id': item.get('dataset', {}).get('id', ''),
'dataset_name': item.get('dataset', {}).get('name', ''),
'category_id': item.get('category', {}).get('id', ''),
'category_name': item.get('category', {}).get('name', ''),
'region': item.get('region', ''),
'delay': str(item.get('delay', '')),
'universe': item.get('universe', ''),
'type': item.get('type', '')
}
def save_to_data_sets_table(self, data_items):
"""批量保存到data_sets表"""
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
for item in data_items:
cursor.execute('''
INSERT INTO data_sets (name, description, dataset_id, dataset_name,
category_id, category_name, region, delay,
universe, type)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
''', (
item['name'], item['description'], item['dataset_id'], item['dataset_name'],
item['category_id'], item['category_name'], item['region'], item['delay'],
item['universe'], item['type']
))
conn.commit()
conn.close()
print(f"✅ 保存了 {len(data_items)} 条记录到 data_sets 表")
def update_category_status(self, category_id, region, universe):
"""更新category表的下载状态"""
conn = sqlite3.connect(self.db_path)
cursor = conn.cursor()
cursor.execute('''
UPDATE category
SET downloaded = 1
WHERE category_id = ?
AND region = ?
AND universe = ?
''', (category_id, region, universe))
conn.commit()
conn.close()
# 更新进度统计
self.completed_tasks += 1
stats = self.get_download_stats()
print(f"✅ 已更新category状态: {category_id} - {region} - {universe}")
print(f"📊 总体进度: {stats['completed_tasks']}/{stats['total_tasks']} ({stats['progress_percentage']:.1f}%)")
def download_category_data_fields(self, category_name, category_id, region, universe, instrumentType, delay):
"""下载指定category的所有data-fields"""
endpoint = 'data-fields'
url = f"{self.base_api_url}/{endpoint}"
print(f"\n开始下载: {category_name} - {category_id} - {region} - {universe}")
# 打印当前任务进度
self.current_task_index += 1
print(f"📋 任务进度: {self.current_task_index}/{self.total_tasks}")
# 获取数据总数
params = self._build_params(category_id, region, universe, instrumentType, delay, limit=1)
response = self.client.get(url, params=params)
if response.status_code != 200:
print(f"❌ 获取数据总数失败: {response.status_code}")
exit(1)
data = response.json()
total_count = data.get('count', 0)
print(f"📊 数据总数: {total_count}")
if total_count == 0:
print(" 没有找到数据,跳过")
return
# 下载所有数据
limit = 50
all_data_items = []
for offset in range(0, total_count, limit):
# 每次循环开始前检查登录状态
self.check_and_renew_login()
time.sleep(random.uniform(2, 2.5))
params = self._build_params(category_id, region, universe, instrumentType, delay, offset, limit)
# 打印数据下载进度
data_progress = f" {offset}/{total_count} ({offset / total_count * 100:.1f}%)"
print(f"📥 数据下载进度:{data_progress}")
response = self.client.get(url, params=params)
if response.status_code != 200:
print(f"❌ 下载失败: {response.status_code}")
exit(1)
data = response.json()
results = data.get('results', [])
# 处理数据
processed_items = [self._process_item(item) for item in results]
all_data_items.extend(processed_items)
print(f"✅ 本页获取到 {len(results)} 条记录")
if len(results) < limit:
print("🎯 到达数据末尾")
break
# 保存到数据库
if all_data_items:
self.save_to_data_sets_table(all_data_items)
def run(self):
"""主运行函数"""
if not self.client:
print("❌ 客户端未初始化")
exit(1)
# 打印开始时的总体进度
print("\n" + "=" * 60)
print("开始下载任务...")
stats = self.get_download_stats()
print(f"初始状态: 已完成 {stats['completed_tasks']}/{stats['total_tasks']} ({stats['progress_percentage']:.1f}%)")
print("=" * 60)
categories = self.get_categories_to_download()
if not categories:
print("✅ 所有category都已下载完成")
return
for category in categories:
# 每次循环开始前检查登录状态
self.check_and_renew_login()
category_name, base_url, category_id, region, universe, instrumentType, delay = category
print(f"\n{'=' * 60}")
print(f"处理category: {category_name}")
print(f"category_id: {category_id}")
print(f"region: {region}, universe: {universe}")
print(f"{'=' * 60}")
try:
# 下载data-fields
self.download_category_data_fields(
category_name, category_id, region, universe, instrumentType, delay
)
# 更新下载状态
self.update_category_status(category_id, region, universe)
print(f"✅ 完成: {category_name} - {region} - {universe}")
except Exception as e:
print(f"❌ 处理失败: {e}")
return
sleep_time = random.uniform(20, 30)
print(f"\n等待 {sleep_time} 秒后继续下一个...")
time.sleep(sleep_time)
# 打印最终进度
print("\n" + "=" * 60)
print("🎉 所有category处理完成!")
final_stats = self.get_download_stats()
print(f"最终状态: 已完成 {final_stats['completed_tasks']}/{final_stats['total_tasks']} ({final_stats['progress_percentage']:.1f}%)")
print("=" * 60)
exit(0)
if __name__ == "__main__":
# category_list = [
# 'analyst',
# 'broker',
# 'earnings',
# 'fundamental',
# 'imbalance',
# 'insiders',
# 'institutions',
# 'macro',
# 'model',
# 'news',
# 'option',
# 'other',
# 'pv',
# 'risk',
# 'sentiment',
# 'shortinterest',
# 'socialmedia'
# ]
downloader = DataSetDownloader()
while True:
downloader.run()
time.sleep(30)