# -*- coding: utf-8 -*-
import os
import jieba
import csv
def process_text(text):
    """Tokenize *text* with jieba and drop noise tokens.

    Args:
        text: Raw text to segment.

    Returns:
        A de-duplicated list of meaningful tokens in first-seen order,
        or None if no token survives filtering.
    """
    # Tokens with no search value: whitespace/control characters,
    # ASCII and full-width punctuation, and common Chinese stop words.
    # A set gives O(1) membership instead of the O(n) scan per token
    # that a list would cost.
    filter_set = {
        '\n', '\t', '\r', '\b', '\f', '\v', ':', '的', '或', '10', '天',
        '了', '可', '是', '该', ',', ' ', '、', '让', '和', '集',
        '/', '日', '在', '(', '_', '-', ')', '(', '上', '距', '与',
        '比', '下', '及', ')', '...', ';', '%', '&', '+', ',', '.',
        ':', ';', '<', '=', '>', '?', '[', ']', '|', '—', '。',
    }

    results = [token for token in jieba.lcut(text) if token not in filter_set]

    if not results:
        return None
    # dict.fromkeys de-duplicates while preserving first-seen order;
    # the original list(set(...)) ordering varied across runs because
    # of hash randomization.
    return list(dict.fromkeys(results))
|
|
|
|
|
|
def search_data_sets_by_keywords(csv_file_path, keywords):
    """Search a CSV file for rows whose descriptions contain any keyword.

    Args:
        csv_file_path: Path to the CSV file. Expected columns:
            0 = id, 1 = data set name, 2 = English description,
            11 = Chinese description, 12 = additional Chinese text.
        keywords: Iterable of keyword strings to look for.

    Returns:
        A list of dicts with keys 'id', 'data_set_name', 'description'
        and 'description_cn', de-duplicated by id (first occurrence
        kept); an empty list if the file does not exist.
    """
    if not os.path.exists(csv_file_path):
        print(f"文件不存在: {csv_file_path}")
        return []

    data_dict = {}  # keyed by id so every data set appears at most once

    with open(csv_file_path, 'r', encoding='utf-8') as f:
        for row in csv.reader(f):
            # Guard against malformed/short rows (the original raised
            # IndexError on anything with fewer than 13 columns).
            if len(row) < 13:
                continue
            # Match against column 12 (index 11) and column 13 (index 12).
            for key in keywords:
                if key in row[11] or key in row[12]:
                    item_id = row[0]
                    if item_id not in data_dict:  # keep first occurrence
                        data_dict[item_id] = {
                            'id': item_id,
                            'data_set_name': row[1],
                            'description': row[2],
                            'description_cn': row[11],
                        }
                    break  # one keyword hit is enough for this row

    return list(data_dict.values())
|
|
|
|
|
|
def extract_keywords_from_text(text_file_path):
    """Extract keywords from a text file.

    Args:
        text_file_path: Path to a UTF-8 text file, one phrase per line.

    Returns:
        The keyword list produced by process_text, or None when the
        file is missing, contains no usable lines, or yields no keywords.
    """
    if not os.path.exists(text_file_path):
        print(f"文件不存在: {text_file_path}")
        return None

    # Collect the non-blank lines, stripped of surrounding whitespace.
    phrases = []
    with open(text_file_path, 'r', encoding='utf-8') as fh:
        for raw_line in fh:
            stripped = raw_line.strip()
            if stripped:
                phrases.append(stripped)

    if not phrases:
        print('关键词文本无数据')
        return None

    # Join every phrase with semicolons and tokenize the combined text.
    keywords = process_text(';'.join(phrases))
    if not keywords:
        return None
    print(f'关键词提取结果: {keywords}')
    return keywords
|
|
|
|
|
|
def main():
    """Entry point: extract keywords from the key file, then print the
    data sets from the combined CSV whose descriptions match them."""
    keywords = extract_keywords_from_text("keys_text.txt")
    if not keywords:
        print("无法提取关键词")
        return

    matches = search_data_sets_by_keywords("all_data_combined.csv", keywords)
    print(f'从数据集中提取了 {len(matches)} 条匹配数据')

    # Print one separated record per matched data set.
    for record in matches:
        print(f"数据集: {record['data_set_name']}")
        print(f"英文描述: {record['description']}")
        print(f"中文描述: {record['description_cn']}")
        print("-" * 50)
|
|
|
|
|
|
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()