
import pandas as pd
import numpy as np

# Load the KMeans results and cross-tabulate them against the manual labels
path = 'Kmeans自动分类结果.xlsx'
df = pd.read_excel(path, dtype=str)
df['计数'] = 1  # helper column so the pivot table can count rows
df1 = pd.pivot_table(df, index=['人工分类'], columns=['Kmeans分类'],
                     values=['计数'], aggfunc=np.sum, fill_value=0)
co = ['人工分类']
co.extend(list(df1['计数'].columns))
df1 = df1.reset_index()
df2 = pd.DataFrame(np.array(df1), columns=co)
path_res = '人工与Kmeans分类结果对照.xlsx'
df2.to_excel(path_res, index=False)
df2
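The pivot table above is simply a contingency table of manual labels against cluster ids. As a sanity check, the same table can be produced in one call with pd.crosstab; this is a minimal sketch that assumes the exported file contains the 人工分类 and Kmeans分类 columns.

import pandas as pd

# Minimal sketch: the same manual-vs-KMeans contingency table via pd.crosstab
df = pd.read_excel('Kmeans自动分类结果.xlsx', dtype=str)
print(pd.crosstab(df['人工分类'], df['Kmeans分类']))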

import random
import jieba.posseg
import pandas as pd
import numpy as np

def is_contain_chinese(check_str):
    # Return 1 if the string contains at least one CJK character, else 0
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return 1
    return 0

def generatorInfo(file_name):
    """
    file_name: path of the file to read; each line is '<label> <text>'
    """
    # Read the text file
    with open(file_name, encoding='utf-8') as file:
        line_list = [k.strip() for k in file.readlines()]
    data = []
    for k in random.sample(line_list, 1000):
        t = k.split(maxsplit=1)
        # Keep words that are not stopwords, not spaces, not in the excluded
        # POS tags, and at least two characters long
        data.append([t[0],
                     ' '.join([w for w, flag in jieba.posseg.cut(t[1])
                               if (w not in stopwords) and (w != ' ')
                               and (flag not in ["nr", "ns", "nt", "nz", "m", "f", "ul", "l", "r", "t"])
                               and (len(w) >= 2)])])
    return data

# Load the Chinese/English stopword list
paths = '中英文停用词.xlsx'
dfs = pd.read_excel(paths, dtype=str)
stopwords = set(dfs['stopwords'])  # test membership against a set, not the Series index

file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)), columns=['类别', '分词'])
df
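The filter above hinges on the (word, flag) pairs that jieba.posseg.cut yields, where flag is the part-of-speech tag. A quick way to see which tokens would survive the POS and length filters is to run a single sentence through the same test; the sentence here is purely illustrative.

import jieba.posseg

# Illustrative only: inspect the (word, flag) pairs and apply the same filters
excluded = ["nr", "ns", "nt", "nz", "m", "f", "ul", "l", "r", "t"]
for w, flag in jieba.posseg.cut('今天北京的天气非常好'):
    keep = (flag not in excluded) and (len(w) >= 2)
    print(w, flag, 'kept' if keep else 'dropped')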

Summary
import random
import jieba.posseg
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans

def is_contain_chinese(check_str):
    # Return 1 if the string contains at least one CJK character, else 0
    for ch in check_str:
        if u'\u4e00' <= ch <= u'\u9fff':
            return 1
    return 0

def generatorInfo(file_name):
    """
    file_name: path of the file to read; each line is '<label> <text>'
    """
    # Read the text file
    with open(file_name, encoding='utf-8') as file:
        line_list = [k.strip() for k in file.readlines()]
    data = []
    for k in random.sample(line_list, 1000):
        t = k.split(maxsplit=1)
        # Keep words that pass the stopword, POS and length filters
        data.append([t[0],
                     ' '.join([w for w, flag in jieba.posseg.cut(t[1])
                               if (w not in stopwords) and (w != ' ')
                               and (flag not in ["nr", "ns", "nt", "nz", "m", "f", "ul", "l", "r", "t"])
                               and (len(w) >= 2)])])
    return data

# Load the Chinese/English stopword list
paths = '中英文停用词.xlsx'
dfs = pd.read_excel(paths, dtype=str)
stopwords = set(dfs['stopwords'])

file_name = 'cnews.train.txt'
df = pd.DataFrame(np.array(generatorInfo(file_name)), columns=['类别', '分词'])

# Count term frequencies; words in the corpus are separated by spaces
corpus = df['分词']
# vectorizer = CountVectorizer(max_features=5000)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Vectorize the texts with TF-IDF weights
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)
word = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
weight = tfidf.toarray()

# Cluster the TF-IDF vectors; n_clusters is the number of clusters
kmeans = KMeans(n_clusters=10)
kmeans.fit(weight)

# Cross-tabulate manual labels against KMeans cluster ids
res = [list(df['类别']), list(kmeans.labels_)]
df_res = pd.DataFrame(np.array(res).T, columns=['人工分类', 'Kmeans分类'])
df_res['计数'] = 1
df1 = pd.pivot_table(df_res, index=['人工分类'], columns=['Kmeans分类'],
                     values=['计数'], aggfunc=np.sum, fill_value=0)
co = ['人工分类']
co.extend(list(df1['计数'].columns))
df1 = df1.reset_index()
df2 = pd.DataFrame(np.array(df1), columns=co)
df2
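The contingency table shows the alignment between manual labels and clusters qualitatively; if a single score is preferred, scikit-learn's clustering metrics compare the two labelings directly. A short sketch reusing df_res from the script above:

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# Quantify agreement between manual labels and KMeans clusters.
# Both metrics are permutation-invariant, so the arbitrary cluster ids do not matter.
ari = adjusted_rand_score(df_res['人工分类'], df_res['Kmeans分类'])
nmi = normalized_mutual_info_score(df_res['人工分类'], df_res['Kmeans分类'])
print(f'ARI: {ari:.3f}, NMI: {nmi:.3f}')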

# Attach the KMeans cluster id to the segmented dataframe
df['Kmeans分类'] = df_res['Kmeans分类']
df
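The cluster ids themselves are opaque, so a common way to label them (not shown in the original post) is to sort each KMeans centroid and print its highest-weighted TF-IDF terms; a sketch assuming kmeans and word from the script above are still in scope:

import numpy as np

# Sketch: top-10 highest-weighted terms per centroid, reusing kmeans and word
order = np.argsort(kmeans.cluster_centers_, axis=1)[:, ::-1]
for cluster_id, idx in enumerate(order):
    print(cluster_id, ' '.join(word[i] for i in idx[:10]))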
