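"""Build a local BRAIN knowledge base.

Logs in to the BRAIN platform, fetches operators and documentation pages
through the helpers in get_knowledgeBase_tool, and writes each category /
page as a JSON file under ./knowledge. Dataset fetching is currently
disabled (see the commented-out process_datasets block below).
"""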
import json
import os
import re
import sys

import pandas as pd

# Add get_knowledgeBase_tool to path
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
TOOL_DIR = os.path.join(SCRIPT_DIR, "get_knowledgeBase_tool")
if TOOL_DIR not in sys.path:
    sys.path.insert(0, TOOL_DIR)

# Import from tool directory
import ace_lib
from fetch_all_operators import fetch_operators, prompt_credentials
from fetch_all_documentation import (
    fetch_tutorials,
    fetch_tutorial_pages,
    fetch_page,
    _extract_page_id,
)
# Dataset fetching currently disabled per request
# from fetch_all_datasets import (
#     fetch_all_combinations,
#     fetch_datasets_for_combo,
#     merge_and_deduplicate,
# )


def ensure_knowledge_dir():
    """Ensure knowledge directory exists"""
    knowledge_dir = os.path.join(SCRIPT_DIR, "knowledge")
    os.makedirs(knowledge_dir, exist_ok=True)
    return knowledge_dir


def to_jsonable(value):
    """Convert values to JSON-serializable, handling NaN and nested structures."""
    try:
        if isinstance(value, float) and pd.isna(value):
            return None
    except TypeError:
        pass
    if isinstance(value, list):
        return [to_jsonable(v) for v in value if not (isinstance(v, float) and pd.isna(v))]
    if isinstance(value, dict):
        return {k: to_jsonable(v) for k, v in value.items()}
    if isinstance(value, (str, int, float, bool)) or value is None:
        return value
    return str(value)
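
# Example: to_jsonable({"a": float("nan"), "b": [1, float("nan")]}) returns
# {"a": None, "b": [1]}: NaN becomes None in dicts and is dropped from lists.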


def safe_filename(name: str, suffix: str = "") -> str:
    """Sanitize a name into a filesystem-safe base (max 80 chars) plus suffix."""
    base = re.sub(r"[^A-Za-z0-9._-]+", "_", str(name)).strip("_") or "doc"
    base = base[:80]
    return f"{base}{suffix}"


def process_operators(session: ace_lib.SingleSession, knowledge_dir: str):
    """
    Process operators and save as JSON files

    Args:
        session: Authenticated BRAIN session
        knowledge_dir: Directory to save JSON files
    """
    print("\n=== Processing Operators ===")

    # Fetch operators data
    print("Fetching operators...")
    operators_df = fetch_operators(session)
    if operators_df.empty:
        print("No operators found!")
        return
    print(f"Found {len(operators_df)} operator entries")

    # Get unique categories
    categories = sorted(operators_df['category'].dropna().unique())
    for category in categories:
        category_data = operators_df[operators_df['category'] == category].copy()

        # Create JSON file for this category
        filename = f"{category.replace(' ', '_').lower()}_operators.json"
        filepath = os.path.join(knowledge_dir, filename)
        print(f"Processing category: {category}")

        # Convert to list of dicts
        category_list = []
        for _, row in category_data.iterrows():
            operator_dict = {}
            for col in row.index:
                operator_dict[col] = to_jsonable(row[col])
            category_list.append(operator_dict)

        # Save category JSON
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(category_list, f, ensure_ascii=False, indent=2)
        print(f"✓ Created {filename} with {len(category_list)} operators")


# Dataset fetching intentionally disabled; keep for potential re-enable.
# def process_datasets(session: ace_lib.SingleSession, dataset_dir: str):
#     """Fetch datasets and save one JSON per region."""
#     print("=== Processing Datasets ===")
#
#     print("Fetching valid instrument/region/delay/universe combinations...")
#     options_df = fetch_all_combinations(session)
#     if options_df is None or options_df.empty:
#         print("No simulation options fetched; aborting dataset fetch.")
#         return
#
#     all_datasets: list[pd.DataFrame] = []
#     combo_idx = 0
#
#     for _, row in options_df.iterrows():
#         instrument_type = row.get("InstrumentType")
#         region = row.get("Region")
#         delay = row.get("Delay")
#         universes = row.get("Universe") or []
#
#         for universe in universes:
#             combo_idx += 1
#             print(f"[{combo_idx}] {instrument_type} / {region} / D{delay} / {universe}")
#             try:
#                 df = fetch_datasets_for_combo(session, instrument_type, region, delay, universe)
#                 print(f" -> {len(df)} rows")
#                 all_datasets.append(df)
#             except Exception as exc:
#                 print(f" -> Failed: {exc}")
#
#     if not all_datasets:
#         print("No datasets fetched; nothing to save.")
#         return
#
#     combined_df = pd.concat([df for df in all_datasets if not df.empty], ignore_index=True)
#     if combined_df.empty:
#         print("No datasets fetched; nothing to save.")
#         return
#
#     regions = sorted(combined_df["param_region"].dropna().unique())
#     print(f"Found regions: {', '.join(regions)}")
#
#     for region in regions:
#         region_df = combined_df[combined_df["param_region"] == region]
#         region_unique = merge_and_deduplicate([region_df])
#
#         region_list = []
#         for _, row in region_unique.iterrows():
#             record = {col: to_jsonable(row[col]) for col in row.index}
#             region_list.append(record)
#
#         filename = f"{region.replace(' ', '_').lower()}_datasets.json"
#         filepath = os.path.join(dataset_dir, filename)
#         with open(filepath, "w", encoding="utf-8") as f:
#             json.dump(region_list, f, ensure_ascii=False, indent=2)
#
#         print(f"✓ Created {filename} with {len(region_list)} datasets")


def process_documentation(session: ace_lib.SingleSession, knowledge_dir: str):
    """Fetch tutorials and pages, save one JSON per page."""
    print("=== Processing Documentation ===")
    tutorials = fetch_tutorials(session)
    if not tutorials:
        print("No tutorials fetched; skipping documentation.")
        return
    print(f"Fetched {len(tutorials)} tutorials")

    page_count = 0
    seen_pages = set()
    for idx, tutorial in enumerate(tutorials, start=1):
        tutorial_id = _extract_page_id(tutorial) or f"tutorial_{idx}"
        tutorial_title = tutorial.get("title") or tutorial_id

        # Gather candidate pages: any inline "pages" list, pages fetched by
        # tutorial id, or (as a fallback) the tutorial entry itself.
        page_candidates = []
        if isinstance(tutorial.get("pages"), list):
            page_candidates.extend(tutorial["pages"])
        if tutorial_id:
            try:
                page_candidates.extend(fetch_tutorial_pages(session, tutorial_id))
            except Exception as exc:
                print(f"[{idx:03d}] failed to fetch pages for {tutorial_id}: {exc}")
        if not page_candidates and tutorial_id:
            page_candidates.append({"id": tutorial_id, "title": tutorial_title})

        for page_entry in page_candidates:
            page_id = _extract_page_id(page_entry)
            if not page_id or page_id in seen_pages:
                continue
            seen_pages.add(page_id)
            try:
                page = fetch_page(session, page_id)
            except Exception as exc:
                print(f"[{idx:03d}] page {page_id} failed: {exc}")
                continue
            page_count += 1
            page_title = page.get("title") or page_entry.get("title") or page_id

            # Save each page as individual JSON
            filename = safe_filename(f"{idx:03d}_{page_title}", "_documentation.json")
            filepath = os.path.join(knowledge_dir, filename)
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(to_jsonable(page), f, ensure_ascii=False, indent=2)
            print(f"[{idx:03d}] ✓ Created {filename}")

    print(f"✓ Total: {page_count} documentation pages saved")


def main():
    print("=== BRAIN Knowledge Base Processor ===")
    print("Starting knowledge base processing...\n")

    # Get credentials, then override ace_lib's credential lookup so the
    # session uses the values gathered here rather than prompting again.
    email, password = prompt_credentials()
    ace_lib.get_credentials = lambda: (email, password)

    print("Logging in to BRAIN platform...")
    try:
        session = ace_lib.start_session()
        print("✓ Login successful\n")
    except Exception as exc:
        print(f"✗ Login failed: {exc}")
        return

    # Ensure knowledge directory exists
    knowledge_dir = ensure_knowledge_dir()
    # dataset_dir = knowledge_dir  # Save datasets directly under knowledge (disabled)
    print(f"Knowledge directory: {knowledge_dir}\n")

    # Process documentation (tutorials/pages)
    print("\nStarting documentation processing...\n")
    try:
        process_documentation(session, knowledge_dir)
    except Exception as exc:
        print(f"✗ Failed to process documentation: {exc}")
        import traceback
        traceback.print_exc()
        return

    # Process operators
    try:
        process_operators(session, knowledge_dir)
    except Exception as exc:
        print(f"✗ Failed to process operators: {exc}")
        import traceback
        traceback.print_exc()
        return

    # Dataset processing disabled; re-enable by uncommenting the block below.
    # print("\nStarting dataset processing...\n")
    # try:
    #     process_datasets(session, dataset_dir)
    # except Exception as exc:
    #     print(f"✗ Failed to process datasets: {exc}")
    #     import traceback
    #     traceback.print_exc()
    #     return

    print("\n=== Processing Complete ===")


if __name__ == "__main__":
    main()
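

# Typical run (the script filename is illustrative; use whatever this file
# is saved as):
#   $ python build_knowledge_base.py
# Prompts for BRAIN credentials, then writes files such as
#   knowledge/<category>_operators.json
#   knowledge/NNN_<page_title>_documentation.json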