You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
199 lines
7.1 KiB
199 lines
7.1 KiB
#!/usr/bin/env python3
|
|
"""
|
|
脚本功能:读取当前目录下的 llm_idea.md 文件,
|
|
匹配 **Implementation Example**: 后面的内容,
|
|
并提取每个示例中函数调用的参数(变量名)
|
|
"""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
|
|
|
|
def extract_implementation_examples(file_path):
    """Extract every ``**Implementation Example**`` snippet from a markdown file.

    A snippet is the text enclosed in single backticks that follows the
    literal marker ``**Implementation Example**:``. Whitespace, including
    line breaks, may separate the marker from the opening backtick, and the
    snippet itself may span several lines.

    Args:
        file_path: Location of the markdown file. Either a ``pathlib.Path``
            or a plain string / ``os.PathLike`` value.

    Returns:
        list: The snippet bodies (without the surrounding backticks) in the
        order they appear in the file. An empty list is returned when nothing
        matches or when the file cannot be read; in the latter case an error
        message is printed instead of raising.
    """
    try:
        # Going through Path() lets callers pass a string as well as a Path;
        # previously a string ended up in the generic error branch below.
        content = Path(file_path).read_text(encoding='utf-8')
    except FileNotFoundError:
        print(f"错误:找不到文件 {file_path}")
        return []
    except Exception as e:
        # Deliberately broad: this reader is best-effort and reports any other
        # failure (permissions, decoding, bad argument) rather than raising.
        print(f"读取文件时出错:{e}")
        return []

    # Non-greedy capture between the first pair of backticks after the marker;
    # DOTALL allows the captured snippet to contain newlines.
    pattern = r'\*\*Implementation Example\*\*:\s*`(.*?)`'
    return re.findall(pattern, content, re.DOTALL)
|
|
|
|
# A complete argument that is a plain integer/decimal constant (20, -1, 0.5).
_NUMBER_RE = re.compile(r'^-?\d+(\.\d+)?$')
# A complete argument that is nothing but a single identifier.
_IDENTIFIER_RE = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
# Any identifier occurring inside a larger piece of text.
_NAME_RE = re.compile(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b')
# A name directly followed by "(": the start of a function call.
_CALL_OPEN_RE = re.compile(r'\w+\s*\(')
# Optional whitespace and "(" right after a name: that name is being called.
_CALL_SUFFIX_RE = re.compile(r'\s*\(')
# Characters that mark an argument as an operator expression.
_OPERATORS = ('+', '-', '*', '/', '>', '<', '=')
# Words never reported as variables inside operator expressions.
_EXPRESSION_KEYWORDS = frozenset({
    'and', 'or', 'not', 'if', 'else', 'for', 'while', 'return',
    'True', 'False', 'None', 'in', 'is', 'lambda',
})
# Words never reported by the last-resort scan: keywords plus the function
# names this script already knew about.
_FALLBACK_EXCLUDED = frozenset({
    'and', 'or', 'not', 'if', 'else', 'for', 'while', 'return',
    'True', 'False', 'None', 'divide', 'subtract', 'add', 'abs',
    'greater', 'equal', 'ts_delay', 'ts_mean', 'ts_std_dev',
    'count_bias_adjusted_price_target_estimates', 'group_rank',
    'zscore', 'days_from_last_change', 'ts_sum', 'power',
})


def _find_closing_paren(text, open_idx):
    """Return the index of the ")" matching the "(" at open_idx, or -1."""
    depth = 0
    for idx in range(open_idx, len(text)):
        char = text[idx]
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            if depth == 0:
                return idx
    return -1


def _find_call_spans(text):
    """Return (name_start, open_idx, close_idx) for each outermost call in text."""
    spans = []
    pos = 0
    while True:
        match = _CALL_OPEN_RE.search(text, pos)
        if match is None:
            return spans
        open_idx = match.end() - 1
        close_idx = _find_closing_paren(text, open_idx)
        if close_idx == -1:
            # Unbalanced opener: skip it and keep looking further right.
            pos = match.end()
            continue
        spans.append((match.start(), open_idx, close_idx))
        # Calls nested inside this one are handled when its arguments are visited.
        pos = close_idx + 1


def _split_top_level_args(args_str):
    """Split an argument list on commas that are not inside (), [] or {}.

    "a, b, c"      -> ['a', 'b', 'c']
    "func(a,b), c" -> ['func(a,b)', 'c']
    """
    args = []
    current = []
    parens = brackets = braces = 0
    for char in args_str:
        if char == ',' and parens == 0 and brackets == 0 and braces == 0:
            args.append(''.join(current).strip())
            current = []
            continue
        current.append(char)
        if char == '(':
            parens += 1
        elif char == ')':
            parens -= 1
        elif char == '[':
            brackets += 1
        elif char == ']':
            brackets -= 1
        elif char == '{':
            braces += 1
        elif char == '}':
            braces -= 1
    if current:
        args.append(''.join(current).strip())
    return args


def extract_variables_from_expression(expression):
    """Collect the variable names used as function-call arguments.

    Every function call in ``expression`` is visited, at any nesting depth,
    and each of its arguments is classified:

    * a numeric constant (``20``, ``-1``, ``0.5``) is ignored;
    * a bare identifier is recorded as a variable;
    * an argument containing parentheses is searched for nested calls, and,
      when it also contains one of ``+ - * / > < =``, the identifiers around
      those calls are recorded too (``open - ts_delay(close, 1)`` yields both
      ``open`` and ``close``);
    * any other argument containing one of those operators contributes all of
      its identifiers except Python-style keywords.

    If that pass finds nothing (for example the expression has no function
    call at all), a last-resort scan reports every identifier that is neither
    in the built-in exclusion list nor immediately followed by ``(``.

    Args:
        expression: The expression text, e.g. ``"divide(a, ts_mean(b, 20))"``.

    Returns:
        list: Unique variable names in order of first appearance.
    """
    variables = []

    def remember(name):
        # De-duplicate while keeping first-appearance order.
        if name not in variables:
            variables.append(name)

    def collect_names(text):
        # Record every identifier in text that is not a keyword.
        for match in _NAME_RE.finditer(text):
            if match.group() not in _EXPRESSION_KEYWORDS:
                remember(match.group())

    def visit_calls(text, include_surrounding):
        # Walk the outermost calls of text from left to right. The text between
        # and around the calls is scanned only when include_surrounding is set,
        # so that names are recorded in the order they are written.
        cursor = 0
        for name_start, open_idx, close_idx in _find_call_spans(text):
            if include_surrounding:
                collect_names(text[cursor:name_start])
            for arg in _split_top_level_args(text[open_idx + 1:close_idx]):
                visit_argument(arg)
            cursor = close_idx + 1
        if include_surrounding:
            collect_names(text[cursor:])

    def visit_argument(arg):
        arg = arg.strip()
        if _NUMBER_RE.match(arg):
            return
        if _IDENTIFIER_RE.match(arg):
            remember(arg)
            return
        has_operator = any(op in arg for op in _OPERATORS)
        if '(' in arg and ')' in arg:
            # Mixed arguments such as "a + f(b)" must yield "a" as well as "b".
            visit_calls(arg, include_surrounding=has_operator)
        elif has_operator:
            collect_names(arg)

    # Only call arguments count at the top level; text outside any call is
    # left to the fallback below.
    visit_calls(expression, include_surrounding=False)

    if not variables:
        for match in _NAME_RE.finditer(expression):
            name = match.group()
            if name in _FALLBACK_EXCLUDED:
                continue
            if _CALL_SUFFIX_RE.match(expression, match.end()):
                # A name that is being called is a function, not a variable.
                continue
            remember(name)

    return variables
|
|
|
|
def main():
    """Print the variables used by each Implementation Example in llm_idea.md.

    The file is looked up in the current working directory. For every example
    found, the expression and a numbered list of its variables are printed;
    if no example is found a notice is printed and the function returns.
    """
    source_file = Path.cwd() / "llm_idea.md"
    heavy_rule = "=" * 80
    light_rule = "-" * 40

    print(f"正在读取文件:{source_file}")
    print(heavy_rule)

    examples = extract_implementation_examples(source_file)
    if not examples:
        print("未找到匹配的 **Implementation Example**: `...` 内容")
        return

    total = len(examples)
    print(f"找到 {total} 个 Implementation Example:\n")

    for number, expression in enumerate(examples, start=1):
        print(heavy_rule)
        print(f"示例 {number}:")
        print(light_rule)
        print(f"表达式: {expression}")
        print(light_rule)

        names = extract_variables_from_expression(expression)
        if not names:
            print("未提取到变量名")
        else:
            print(f"提取到的变量 ({len(names)} 个):")
            for position, name in enumerate(names, start=1):
                print(f" {position}. {name}")

        # Blank line between examples.
        print()

    print(heavy_rule)
    print(f"处理完成!共处理 {total} 个示例")
|
|
|
|
# Script entry point: run the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()