++

6 months ago · 717f8c1747
parent f48f8c5550
commit 717f8c1747
3 changed files with 2780 additions and 0 deletions
--- a/data_sets/all_data_combined.csv
+++ b/data_sets/all_data_combined.csv
--- a/data_sets/keys_text.txt
+++ b/data_sets/keys_text.txt
@@ -0,0 +1 @@
+["implied_volatility", "iv", "volatility", "call", "put", "option", "skew", "strike", "moneyness", "vix", "variance", "delta", "gamma", "vega", "theta", "atm", "otm", "itm", "surface", "term", "expiry", "risk_reversal", "butterfly", "spread", "premium", "volume", "open_interest", "bid", "ask", "mid", "spread", "ratio", "percentile", "rank", "zscore", "decay", "momentum", "trend", "sum", "mean", "std", "corr", "beta", "residual", "resid", "regression", "factor", "alpha", "exposure", "neutralized", "industry", "sector", "market_cap", "volume", "liquidity", "turnover", "float", "short_interest", "borrow_fee", "dividend", "earnings", "surprise", "revision", "estimate", "actual", "guidance", "sentiment", "news", "analyst", "rating", "target", "recommendation", "upgrade", "downgrade", "initiation", "coverage", "momentum", "reversal", "value", "growth", "quality", "leverage", "profitability", "efficiency", "solvency", "liquidity", "accruals", "investment", "intangibles", "f_score", "z_score", "o_score", "m_score", "g_score", "p_score"]
--- a/data_sets/seach_data_sets.py
+++ b/data_sets/seach_data_sets.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+import os
+import jieba
+import csv
+
+
+def process_text(text):
+    """
+    使用jieba分词并过滤不需要的字符
+    """
+    filter_list = ['\n', '\t', '\r', '\b', '\f', '\v', '：', '的', '或', '10', '天', '了', '可', '是', '该', '，', ' ', '、', '让', '和', '集',
+                   '/', '日', '在', '（', '_', '-', ')', '(', '上', '距', '与', '比', '下', '及', '）', '...', '；', '%', '&', '+', ',', '.',
+                   ':', ';', '<', '=', '>', '?', '[', ']', '|', '—', '。'
+                   ]
+
+    text_list = jieba.lcut(text)
+    results = []
+    for tl in text_list:
+        should_include = True
+        for fl in filter_list:
+            if fl == tl:
+                should_include = False
+                break
+        if should_include:
+            results.append(tl)
+
+    if results:
+        return list(set(results))  # 去重
+    else:
+        return None
+
+
+def search_data_sets_by_keywords(csv_file_path, keywords):
+    """
+    根据关键词搜索csv文件中的匹配项
+    
+    Args:
+        csv_file_path: CSV文件路径
+        keywords: 关键词列表
+    
+    Returns:
+        匹配的数据集列表
+    """
+    if not os.path.exists(csv_file_path):
+        print(f"文件不存在: {csv_file_path}")
+        return []
+
+    data_dict = {}  # 使用字典来存储，以id为键去重
+
+    with open(csv_file_path, 'r', encoding='utf-8') as f:
+        reader = csv.reader(f)
+        for row in reader:
+            # 检查每一行的第12列(索引11)和第13列(索引12)是否包含任意关键词
+            for key in keywords:
+                if key in row[11] or key in row[12]:
+                    item_id = row[0]
+                    # 如果id不存在，或者想要保留第一个出现的记录
+                    if item_id not in data_dict:
+                        data_dict[item_id] = {
+                            'id': item_id,
+                            'data_set_name': row[1],
+                            'description': row[2],
+                            'description_cn': row[11],
+                        }
+
+    # 将字典的值转换为列表
+    return list(data_dict.values())
+
+
+def extract_keywords_from_text(text_file_path):
+    """
+    从文本文件中提取关键词
+    
+    Args:
+        text_file_path: 文本文件路径
+    
+    Returns:
+        提取的关键词列表
+    """
+    if not os.path.exists(text_file_path):
+        print(f"文件不存在: {text_file_path}")
+        return None
+    
+    with open(text_file_path, 'r', encoding='utf-8') as f:
+        text_list = [line.strip() for line in f if line.strip()]
+        if not text_list:
+            print('关键词文本无数据')
+            return None
+
+        # 将所有文本合并并用分号连接，然后进行处理
+        result_str = process_text(';'.join(text_list))
+        
+        if result_str:
+            print(f'关键词提取结果: {result_str}')
+            return result_str
+        else:
+            return None
+
+
+def main():
+    keys_text_path = "keys_text.txt"
+    keywords = extract_keywords_from_text(keys_text_path)
+    
+    if not keywords:
+        print("无法提取关键词")
+        return
+    
+    csv_file_path = "all_data_combined.csv"
+    matched_data_sets = search_data_sets_by_keywords(csv_file_path, keywords)
+    
+    print(f'从数据集中提取了 {len(matched_data_sets)} 条匹配数据')
+    
+    for data_set in matched_data_sets:
+        print(f"数据集: {data_set['data_set_name']}")
+        print(f"英文描述: {data_set['description']}")
+        print(f"中文描述: {data_set['description_cn']}")
+        print("-" * 50)
+
+
+# Run the keyword search only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()
				`@ -0,0 +1 @@`
				["implied_volatility", "iv", "volatility", "call", "put", "option", "skew", "strike", "moneyness", "vix", "variance", "delta", "gamma", "vega", "theta", "atm", "otm", "itm", "surface", "term", "expiry", "risk_reversal", "butterfly", "spread", "premium", "volume", "open_interest", "bid", "ask", "mid", "spread", "ratio", "percentile", "rank", "zscore", "decay", "momentum", "trend", "sum", "mean", "std", "corr", "beta", "residual", "resid", "regression", "factor", "alpha", "exposure", "neutralized", "industry", "sector", "market_cap", "volume", "liquidity", "turnover", "float", "short_interest", "borrow_fee", "dividend", "earnings", "surprise", "revision", "estimate", "actual", "guidance", "sentiment", "news", "analyst", "rating", "target", "recommendation", "upgrade", "downgrade", "initiation", "coverage", "momentum", "reversal", "value", "growth", "quality", "leverage", "profitability", "efficiency", "solvency", "liquidity", "accruals", "investment", "intangibles", "f_score", "z_score", "o_score", "m_score", "g_score", "p_score"]