You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
28 lines
818 B
28 lines
818 B
import jieba
|
|
'''
|
|
数据库中读取数据集描述, 转换成标签
|
|
'''
|
|
|
|
|
|
|
|
def process_text(text):
    """Segment *text* with jieba and keep the unique tokens that pass the filter.

    A token is discarded when it contains any entry of the filter tuple below
    as a substring (control/whitespace characters, a few punctuation marks,
    several single Chinese characters and the literal ``'10'``).

    Args:
        text: The string to segment; it is passed directly to ``jieba.lcut``.

    Returns:
        The kept tokens as a list, de-duplicated, in order of first
        appearance. The same list is also printed to stdout.
    """
    # A tuple of constants is created once at compile time, not on every call.
    filter_list = ('\n', '\t', '\r', '\b', '\f', '\v', ':', '的', '或', '10', '天', '了', '可', '是', '该', ',', ' ', '、', '让', '和', '集')

    # Tokenize with jieba.
    text_list = jieba.lcut(text)

    # Keep only the tokens that contain none of the filtered substrings.
    kept = [
        token
        for token in text_list
        if not any(fl in token for fl in filter_list)
    ]

    # dict.fromkeys removes duplicates while preserving first-seen order.
    # A set would also de-duplicate, but its iteration order for strings
    # changes from run to run, making the output non-reproducible.
    results = list(dict.fromkeys(kept))

    print(results)
    return results