# -*- coding: utf-8 -*-
import os
import jieba
import csv
def process_text(text):
    """Tokenize *text* with jieba and drop noise tokens.

    Args:
        text: Raw text to segment.

    Returns:
        A de-duplicated list of meaningful tokens in first-seen order,
        or None if no token survives filtering.
    """
    # Tokens with no search value: whitespace/control characters,
    # ASCII and full-width punctuation, and common Chinese stop words.
    # A set gives O(1) membership instead of the O(n) scan per token
    # that a list would cost.
    filter_set = {
        '\n', '\t', '\r', '\b', '\f', '\v', ':', '的', '或', '10', '天',
        '了', '可', '是', '该', ',', ' ', '、', '让', '和', '集',
        '/', '日', '在', '(', '_', '-', ')', '(', '上', '距', '与',
        '比', '下', '及', ')', '...', ';', '%', '&', '+', ',', '.',
        ':', ';', '<', '=', '>', '?', '[', ']', '|', '—', '。',
    }

    results = [token for token in jieba.lcut(text) if token not in filter_set]

    if not results:
        return None
    # dict.fromkeys de-duplicates while preserving first-seen order;
    # the original list(set(...)) ordering varied across runs because
    # of hash randomization.
    return list(dict.fromkeys(results))
|
|
|
|
|
|
def search_data_sets_by_keywords(csv_file_path, keywords):
    """Search a CSV file for rows whose descriptions contain any keyword.

    Args:
        csv_file_path: Path to the CSV file. Expected columns:
            0 = id, 1 = data set name, 2 = English description,
            11 = Chinese description, 12 = additional Chinese text.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A list of dicts with keys 'id', 'data_set_name', 'description'
        and 'description_cn', de-duplicated by id (first occurrence
        kept); an empty list if the file does not exist.
    """
    if not os.path.exists(csv_file_path):
        print(f"文件不存在: {csv_file_path}")
        return []

    data_dict = {}  # keyed by id so every data set appears at most once

    with open(csv_file_path, 'r', encoding='utf-8') as f:
        for row in csv.reader(f):
            # Guard against malformed/short rows (the original raised
            # IndexError on anything with fewer than 13 columns).
            if len(row) < 13:
                continue
            # Match against column 12 (index 11) and column 13 (index 12).
            for key in keywords:
                if key in row[11] or key in row[12]:
                    item_id = row[0]
                    if item_id not in data_dict:  # keep first occurrence
                        data_dict[item_id] = {
                            'id': item_id,
                            'data_set_name': row[1],
                            'description': row[2],
                            'description_cn': row[11],
                        }
                    break  # one keyword hit is enough for this row

    return list(data_dict.values())
|
|
|
|
|
|
def extract_keywords_from_text(text_file_path):
    """Extract keywords from a text file.

    Args:
        text_file_path: Path to a UTF-8 text file, one phrase per line.

    Returns:
        The keyword list produced by process_text, or None when the
        file is missing, contains no usable lines, or yields no keywords.
    """
    if not os.path.exists(text_file_path):
        print(f"文件不存在: {text_file_path}")
        return None

    # Collect the non-blank lines, stripped of surrounding whitespace.
    phrases = []
    with open(text_file_path, 'r', encoding='utf-8') as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if stripped:
                phrases.append(stripped)

    if not phrases:
        print('关键词文本无数据')
        return None

    # Join every phrase with semicolons and tokenize the combined text.
    keywords = process_text(';'.join(phrases))
    if not keywords:
        return None
    print(f'关键词提取结果: {keywords}')
    return keywords
|
|
|
|
|
|
def main():
    """Entry point: extract keywords from the key file, then print the
    data sets from the combined CSV whose descriptions match them."""
    keywords = extract_keywords_from_text("keys_text.txt")
    if not keywords:
        print("无法提取关键词")
        return

    matches = search_data_sets_by_keywords("all_data_combined.csv", keywords)
    print(f'从数据集中提取了 {len(matches)} 条匹配数据')

    # Print one separated record per matched data set.
    for record in matches:
        print(f"数据集: {record['data_set_name']}")
        print(f"英文描述: {record['description']}")
        print(f"中文描述: {record['description_cn']}")
        print("-" * 50)
|
|
|
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()