• Text Mining and NLP Notes (Code Edition): Word Segmentation


    Word segmentation: jieba.cut

    words = jieba.cut("我来到北京大学", cut_all=True)
    print('全模式:' + '/'.join(words))  # full mode: every possible word is emitted
    
    words = jieba.cut("我来到北京大学", cut_all=False)
    print('精确模式:' + '/'.join(words))  # exact mode (the default)
    
    words = jieba.cut_for_search("小明毕业于北京大学,后在美国哈佛大学深造")
    print('/'.join(words))  # search-engine mode: exact mode, plus long words are split again
    

    全模式:我/来到/北京/北京大学/大学
    精确模式:我/来到/北京大学
    小明/毕业/于/北京/大学/北京大学/,/后/在/美国/哈佛/大学/美国哈佛大学/深造
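
    jieba.cut returns a generator; jieba.lcut and jieba.lcut_for_search return plain lists, which can be more convenient in a notebook. A minimal sketch:

    print(jieba.lcut("我来到北京大学"))                 # exact mode, as a list
    print(jieba.lcut("我来到北京大学", cut_all=True))    # full mode, as a list
    print(jieba.lcut_for_search("小明毕业于北京大学,后在美国哈佛大学深造"))  # search-engine mode, as a list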

    Exercise: try adding a custom user dictionary (a sketch follows).
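
    A minimal sketch of both ways to extend the dictionary; userdict.txt is a hypothetical file with one entry per line in jieba's "word [freq] [POS]" format:

    import jieba
    
    jieba.load_userdict('userdict.txt')      # hypothetical file: one "word freq pos" entry per line
    jieba.add_word('寒暑假住校申请表')          # or register a single word programmatically
    
    print('/'.join(jieba.cut("你想去学校填写学生寒暑假住校申请表吗?")))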

    Part-of-speech tagging: jieba.posseg

    import jieba.posseg as pg
    
    for word, flag in pg.cut("你想去学校填写学生寒暑假住校申请表吗?"):
        print('%s %s' % (word, flag))
    

    Each word is printed together with its part-of-speech flag, one pair per line (nouns are tagged n, verbs v, and so on).
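
    The flags make it easy to keep only certain parts of speech; a minimal sketch that keeps words whose flag starts with n (nouns) or v (verbs):

    import jieba.posseg as pg
    
    sentence = "你想去学校填写学生寒暑假住校申请表吗?"
    kept = [w for w, flag in pg.cut(sentence) if flag.startswith(('n', 'v'))]
    print('/'.join(kept))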

    Segmentation with stop words

    import jieba
    import pandas as pd
    import numpy as np
    
    paths = '中英文停用词.xlsx'
    dfs = pd.read_excel(paths, dtype=str)  # stop-word table (Chinese and English), used in later sections
    
    stopwords = ['想', '去', '吗', '?']  # a tiny hand-written stop-word list for this demo
    
    words = jieba.cut("你想去学校填写学生寒暑假住校申请表吗?")
    '/'.join([w for w in words if w not in stopwords])  # '/' is just the separator between the remaining tokens
    

    '你/学校/填写/学生/寒暑假/住校/申请表'
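
    A minimal sketch of filtering with the stop-word table loaded above instead of the hand-written list, assuming the Excel file has a column named stopwords (the column name used in the later sections):

    stopword_set = set(dfs['stopwords'].dropna())  # Series values -> set for fast membership tests
    words = jieba.cut("你想去学校填写学生寒暑假住校申请表吗?")
    print('/'.join([w for w in words if w not in stopword_set]))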

    Converting a txt file to a DataFrame

    import random
    import jieba.posseg as pg
    import pandas as pd
    import numpy as np
    
    def generatorInfo(file_name):
        # read the labelled corpus: each line is 'label<whitespace>text'
        stopword_set = set(dfs['stopwords'].dropna())  # test membership against values, not the Series index
        with open(file_name, encoding='utf-8') as file:
            line_list = [k.strip() for k in file.readlines()]
            data = []
            for k in random.sample(line_list, 1000):  # sample 1000 lines at random
                label, content = k.split(maxsplit=1)
                tokens = [w for w, flag in pg.cut(content)
                          if w not in stopword_set and w != ' ' and len(w) >= 2]
                data.append([label, ' '.join(tokens)])
        return data
    
    file_name = 'cnews.train.txt'
    df = pd.DataFrame(np.array(generatorInfo(file_name)),columns=['类别','分词'])
    path = '训练集分词结果(随机选取1000个样本).xlsx'
    df.to_excel(path,index=False)
    df
    
    [screenshot: resulting DataFrame with 类别 and 分词 columns]

    Word cloud: wordcloud

    %matplotlib inline
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud
    
    text = ' '.join(list(df['分词']))
    wcloud = WordCloud(
        font_path='simsun.ttc',    # font file that can render Chinese characters
        background_color='white',  # background colour
        max_words=500,             # maximum number of words shown in the cloud
        max_font_size=150,         # maximum font size
        #mask = mask               # optional mask image for the cloud's shape
    )
    
    wcloud = wcloud.generate(text)  # generate the word cloud from the space-separated tokens
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
    
    [screenshot: word cloud rendered from the segmented corpus]
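
    The rendered cloud can also be written straight to disk; a minimal sketch (the file name is only an example):

    wcloud.to_file('wordcloud.png')  # save the current word cloud as a PNG image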

    Keyword extraction: jieba.analyse.extract_tags

    import jieba.analyse
    import pandas as pd
    import numpy as np
    
    path = '训练集分词结果(随机选取1000个样本).xlsx'
    df = pd.read_excel(path,dtype=str)
    s = ' '.join(list(df['分词']))
    for w, x in jieba.analyse.extract_tags(s, withWeight=True):  # top keywords with TF-IDF weights (topK defaults to 20)
        print('%s %s' % (w,x))
    
    [screenshot: keywords and TF-IDF weights for the whole corpus]
    Exercise: extract the keywords with the TextRank algorithm instead (a sketch follows).
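    A minimal sketch using jieba's built-in TextRank extractor on the same concatenated string s; by default it only keeps nouns and verbs (controlled by the allowPOS parameter):

    import jieba.analyse
    
    for w, x in jieba.analyse.textrank(s, topK=20, withWeight=True):
        print('%s %s' % (w, x))

    The next block runs the TF-IDF based extract_tags separately for each category: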
    import jieba.analyse
    import pandas as pd
    import numpy as np
    
    path = '训练集分词结果(随机选取1000个样本).xlsx'
    df = pd.read_excel(path,dtype=str)
    tag = list(set(list(df['类别'])))
    
    for t in tag:
        s = ' '.join(list(df[df['类别']==t]['分词']))
        print(t)
        for w,x in jieba.analyse.extract_tags(s,withWeight=True):
            print('%s %s' % (w, x))
        
    
    [screenshot: top keywords for each category]

    Building word vectors

    Two simple building blocks for document vectors are scikit-learn's CountVectorizer, which turns the texts into a term-frequency matrix, and TfidfTransformer, which re-weights that matrix with TF-IDF.

    # CountVectorizer converts the texts into a term-frequency matrix
    from sklearn.feature_extraction.text import CountVectorizer
    path = '训练集分词结果(随机选取1000个样本).xlsx'
    df = pd.read_excel(path, dtype=str)
    corpus = df['分词']
    #vectorizer = CountVectorizer(max_features=5000)  # optionally cap the vocabulary size
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    print(X)
    
    [screenshot: printout of the sparse term-frequency matrix]
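
    print(X) shows the sparse matrix in coordinate form; a short sketch to check its shape and peek at the vocabulary:

    print(X.shape)                                   # (number of documents, vocabulary size)
    print(vectorizer.get_feature_names_out()[:20])   # first few vocabulary entries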
    from sklearn.feature_extraction.text import TfidfTransformer
    
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)         # re-weight the count matrix with TF-IDF
    word = vectorizer.get_feature_names_out()    # vocabulary (get_feature_names() on older scikit-learn)
    weight = tfidf.toarray()                     # dense document-term weight matrix
    print(weight)
    
    [screenshot: TF-IDF weight matrix]
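
    The two steps can also be collapsed into one with TfidfVectorizer, which applies the same default tokenisation as CountVectorizer and then the TF-IDF re-weighting; a minimal sketch:

    from sklearn.feature_extraction.text import TfidfVectorizer
    
    tfidf_vec = TfidfVectorizer()                  # CountVectorizer + TfidfTransformer in one estimator
    weight_alt = tfidf_vec.fit_transform(corpus)   # sparse TF-IDF matrix, equivalent to `tfidf` above
    print(weight_alt.shape)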

    Classification: manual labels vs. KMeans

    from sklearn.cluster import KMeans
    
    path = '训练集分词结果(随机选取1000个样本).xlsx'
    df = pd.read_excel(path, dtype=str)
    
    # `weight` is the TF-IDF matrix built in the previous step
    kmeans = KMeans(n_clusters=10)  # n_clusters: number of clusters
    kmeans.fit(weight)
    res = [list(df['类别']),list(kmeans.labels_)]
    df_res = pd.DataFrame(np.array(res).T,columns=['人工分类','Kmeans分类'])
    path_res = 'Kmeans自动分类结果.xlsx'
    df_res.to_excel(path_res,index=False)
    df_res
    
    [screenshot: per-document manual category and KMeans cluster id]
    path = 'Kmeans自动分类结果.xlsx'
    df = pd.read_excel(path,dtype=str)
    
    df['计数'] = 1  # helper column: each row counts once
    df1 = pd.pivot_table(df, index=['人工分类'], columns=['Kmeans分类'], values=['计数'], aggfunc='sum', fill_value=0)
    co = ['人工分类']
    co.extend(list(df1['计数'].columns))  # cluster ids become the remaining column names
    df1 = df1.reset_index()
    df2 = pd.DataFrame(np.array(df1), columns=co)
    
    path_res = '人工与Kmeans分类结果对照.xlsx'
    df2.to_excel(path_res,index=False)
    
    df2
    
    [screenshot: cross-tab of manual categories against KMeans clusters]
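    The cross-tab can be summarised with a single agreement score; a minimal sketch using scikit-learn's adjusted Rand index on the comparison table built above (1.0 means the two partitions are identical, values near 0 mean chance-level agreement):

    from sklearn.metrics import adjusted_rand_score
    
    ari = adjusted_rand_score(df_res['人工分类'], df_res['Kmeans分类'])
    print('Adjusted Rand index: %.3f' % ari)

    The next block repeats the segmentation step, this time also filtering tokens by their POS flags before clustering: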
    import random
    
    def is_contain_chinese(check_str):
        # return 1 if the string contains at least one Chinese character, else 0
        for ch in check_str:
            if u'\u4e00' <= ch <= u'\u9fff':
                return 1
        return 0
    
    def generatorInfo(file_name):
        """
        file_name: path to the labelled corpus; each line is 'label<whitespace>text'.
        Returns one [label, space-joined tokens] pair per sampled line, after removing
        stop words, single-character tokens and several POS classes.
        """
        stopword_set = set(dfs['stopwords'].dropna())  # test membership against values, not the Series index
        # POS flags to drop: proper nouns (nr/ns/nt/nz), numerals (m), locality words (f),
        # particles (ul), idiomatic phrases (l), pronouns (r) and time words (t)
        drop_flags = {"nr", "ns", "nt", "nz", "m", "f", "ul", "l", "r", "t"}
        # read the corpus and sample 1000 lines at random
        with open(file_name, encoding='utf-8') as file:
            line_list = [k.strip() for k in file.readlines()]
            data = []
            for k in random.sample(line_list, 1000):
                label, content = k.split(maxsplit=1)
                tokens = [w for w, flag in jieba.posseg.cut(content)
                          if w not in stopword_set and w != ' '
                          and flag not in drop_flags and len(w) >= 2]
                data.append([label, ' '.join(tokens)])
        return data
    
    # load the stop-word table (Chinese and English)
    paths = '中英文停用词.xlsx'
    dfs = pd.read_excel(paths,dtype=str)
    
    file_name = 'cnews.train.txt'
    df = pd.DataFrame(np.array(generatorInfo(file_name)),columns=['类别','分词'])
    df
    
    [screenshot: DataFrame after POS-filtered segmentation]

    Putting it all together

    import random
    import jieba
    import jieba.posseg            # needed for jieba.posseg.cut below
    import pandas as pd
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfTransformer
    
    def is_contain_chinese(check_str):
        # return 1 if the string contains at least one Chinese character, else 0
        for ch in check_str:
            if u'\u4e00' <= ch <= u'\u9fff':
                return 1
        return 0
    
    def generatorInfo(file_name):
        """
        file_name: path to the labelled corpus; each line is 'label<whitespace>text'.
        Returns one [label, space-joined tokens] pair per sampled line, after removing
        stop words, single-character tokens and several POS classes.
        """
        stopword_set = set(dfs['stopwords'].dropna())  # test membership against values, not the Series index
        # POS flags to drop: proper nouns (nr/ns/nt/nz), numerals (m), locality words (f),
        # particles (ul), idiomatic phrases (l), pronouns (r) and time words (t)
        drop_flags = {"nr", "ns", "nt", "nz", "m", "f", "ul", "l", "r", "t"}
        # read the corpus and sample 1000 lines at random
        with open(file_name, encoding='utf-8') as file:
            line_list = [k.strip() for k in file.readlines()]
            data = []
            for k in random.sample(line_list, 1000):
                label, content = k.split(maxsplit=1)
                tokens = [w for w, flag in jieba.posseg.cut(content)
                          if w not in stopword_set and w != ' '
                          and flag not in drop_flags and len(w) >= 2]
                data.append([label, ' '.join(tokens)])
        return data
    
    # load the stop-word table (Chinese and English)
    paths = '中英文停用词.xlsx'
    dfs = pd.read_excel(paths,dtype=str)
    
    file_name = 'cnews.train.txt'
    df = pd.DataFrame(np.array(generatorInfo(file_name)),columns=['类别','分词'])
    
    # build the term-frequency matrix (tokens in each document are space-separated)
    corpus = df['分词']
    #vectorizer = CountVectorizer(max_features=5000)  # optionally cap the vocabulary size
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    
    # TF-IDF vectorisation
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    word = vectorizer.get_feature_names_out()  # vocabulary (get_feature_names() on older scikit-learn)
    weight = tfidf.toarray()
    
    kmeans = KMeans(n_clusters=10)  # n_clusters: number of clusters
    kmeans.fit(weight)
    
    res = [list(df['类别']),list(kmeans.labels_)]
    df_res = pd.DataFrame(np.array(res).T,columns=['人工分类','Kmeans分类'])
    
    df_res['计数'] = 1  # helper column: each row counts once
    df1 = pd.pivot_table(df_res, index=['人工分类'], columns=['Kmeans分类'], values=['计数'], aggfunc='sum', fill_value=0)
    co = ['人工分类']
    co.extend(list(df1['计数'].columns))  # cluster ids become the remaining column names
    df1 = df1.reset_index()
    df2 = pd.DataFrame((np.array(df1)),columns=co)
    df2
    
    [screenshot: cross-tab of manual categories against KMeans clusters]
    df['Kmeans分类'] = df_res['Kmeans分类']
    df
    
    [screenshot: DataFrame with the KMeans cluster id appended]
  • Original post: https://www.cnblogs.com/ranxi169/p/16833336.html