# -*- coding: utf-8 -*-
"""Extract keywords from a text file and search a combined-dataset CSV for matches."""
import csv
import os

import jieba  # third-party Chinese word segmentation

# Tokens dropped after segmentation: whitespace/control characters,
# ASCII and full-width punctuation, and common Chinese stop words.
# frozenset gives O(1) membership tests instead of the original O(n) scan.
_FILTER_SET = frozenset([
    '\n', '\t', '\r', '\b', '\f', '\v', ':', '的', '或', '10', '天', '了',
    '可', '是', '该', ',', ' ', '、', '让', '和', '集', '/', '日', '在',
    '(', '_', '-', ')', '(', '上', '距', '与', '比', '下', '及', ')',
    '...', ';', '%', '&', '+', ',', '.', ':', ';', '<', '=', '>', '?',
    '[', ']', '|', '—', '。',
])


def process_text(text):
    """Segment ``text`` with jieba and drop stop words / punctuation.

    Args:
        text: Raw input string to segment.

    Returns:
        A de-duplicated list of surviving tokens, or ``None`` when every
        token was filtered out. NOTE: the list order is unspecified
        (set-based de-duplication), matching the original behavior.
    """
    tokens = [t for t in jieba.lcut(text) if t not in _FILTER_SET]
    # 去重 (de-duplicate); empty result maps to None, not [].
    return list(set(tokens)) if tokens else None


def search_data_sets_by_keywords(csv_file_path, keywords):
    """Return CSV rows whose column 12 or 13 contains any of ``keywords``.

    Args:
        csv_file_path: Path to the CSV file to scan.
        keywords: Iterable of keyword strings.

    Returns:
        List of dicts (``id``, ``data_set_name``, ``description``,
        ``description_cn``), de-duplicated by id — the first occurrence
        of each id wins. Empty list when the file does not exist.
    """
    if not os.path.exists(csv_file_path):
        print(f"文件不存在: {csv_file_path}")
        return []

    data_dict = {}  # id -> record; dict keeps only the first row per id
    # newline='' per the csv module docs so quoted embedded newlines
    # are parsed correctly.
    with open(csv_file_path, 'r', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            # Guard against short/malformed rows: the original code
            # raised IndexError on any row with fewer than 13 columns.
            if len(row) < 13:
                continue
            # Columns 12 (index 11) and 13 (index 12) hold the searchable text.
            if any(key in row[11] or key in row[12] for key in keywords):
                item_id = row[0]
                if item_id not in data_dict:
                    data_dict[item_id] = {
                        'id': item_id,
                        'data_set_name': row[1],
                        'description': row[2],
                        'description_cn': row[11],
                    }
    return list(data_dict.values())


def extract_keywords_from_text(text_file_path):
    """Read a text file and extract keywords from its contents.

    Args:
        text_file_path: Path to the keyword source text file.

    Returns:
        List of extracted keywords, or ``None`` when the file is missing,
        empty, or yields no tokens after filtering.
    """
    if not os.path.exists(text_file_path):
        print(f"文件不存在: {text_file_path}")
        return None

    with open(text_file_path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    if not lines:
        print('关键词文本无数据')
        return None

    # Join all non-blank lines with ';' and segment the whole thing at once.
    keywords = process_text(';'.join(lines))
    if keywords:
        print(f'关键词提取结果: {keywords}')
        return keywords
    return None


def main():
    """Script entry point: extract keywords, then report matching datasets."""
    keys_text_path = "keys_text.txt"
    keywords = extract_keywords_from_text(keys_text_path)
    if not keywords:
        print("无法提取关键词")
        return

    csv_file_path = "all_data_combined.csv"
    matched_data_sets = search_data_sets_by_keywords(csv_file_path, keywords)
    print(f'从数据集中提取了 {len(matched_data_sets)} 条匹配数据')
    for data_set in matched_data_sets:
        print(f"数据集: {data_set['data_set_name']}")
        print(f"英文描述: {data_set['description']}")
        print(f"中文描述: {data_set['description_cn']}")
        print("-" * 50)


if __name__ == "__main__":
    main()