1. Complete English text preprocessing code
from nltk.corpus import stopwords as pw

cacheStopWords = pw.words("english")

def English_processing(sentence):
    if sentence:
        # lowercase the whole sentence
        sentence = sentence.lower()

        # replace punctuation and digits with spaces
        for ch in "“”!?.\\;'',()<>{}/-1234567890$&#%~":
            sentence = sentence.replace(ch, " ")

        # drop English stop words
        sentence = ' '.join(word for word in sentence.split() if word not in cacheStopWords)

        # drop leftover HTML fragments such as 'br'
        sentence = ' '.join(word for word in sentence.split() if word not in ['br', 'w', 'b', 'bc'])

        return sentence
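A quick sanity check (a minimal sketch; the sample sentence is made up):

# punctuation, digits, stop words, and the leftover 'br' tag are all stripped
print(English_processing("The movie was GREAT!!! 10/10, would watch again <br>"))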
2. Word frequency statistics + word cloud analysis
Word frequency statistics
def concat_sentence(sen_list):
    # join a list of sentences into one long document
    all_sen = ""
    for i in sen_list:
        all_sen += ' '
        all_sen += str(i)
    return all_sen

def compute_word_fre(sentence):
    # count word frequencies after preprocessing
    if sentence:
        word_fre = {}
        sentence = English_processing(sentence)

        words = sentence.strip().split()

        for word in words:
            word_fre[word] = word_fre.get(word, 0) + 1

        return word_fre

def output(word_fre):
    # sort (word, count) pairs by count, descending
    if word_fre:
        sort_word = sorted(word_fre.items(), key=lambda s: s[1], reverse=True)
        return sort_word
Word cloud analysis
sentences_list is the input list of sentences.
concat_sentence joins them into one document, whose word frequencies are then counted.
If your input is already a single document, skip the first line below.
all_sen = concat_sentence(sentences_list) 
word_fre = compute_word_fre(all_sen)
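Before plotting, you can inspect the most frequent words with the output helper defined above (a small sketch):

top_words = output(word_fre)
print(top_words[:10])  # the ten most frequent (word, count) pairs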
import matplotlib.pyplot as plt
from imageio import imread
from wordcloud import WordCloud

# 'cloud.png' is a mask image that shapes the cloud
wordcloud = WordCloud(background_color='white', collocations=False,
                      mask=imread('cloud.png', pilmode="RGB"),
                      max_words=30, random_state=2021,
                      width=1200, height=800).fit_words(word_fre)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
wordcloud.to_file("wordcloud.png")
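If no mask image is available, the cloud can be drawn on a plain canvas instead (a minimal sketch under that assumption; the output file name is arbitrary):

wordcloud = WordCloud(background_color='white', collocations=False,
                      max_words=30, random_state=2021,
                      width=1200, height=800).fit_words(word_fre)
wordcloud.to_file("wordcloud_nomask.png")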
3. Sentiment analysis
NLTK
NLTK, short for Natural Language Toolkit, is a Python library widely used in NLP research. It was originally developed by Steven Bird and Edward Loper at the University of Pennsylvania and has since grown to more than one hundred thousand lines of code. It is an open-source project that ships with datasets, Python modules, tutorials, and more.
Sentiment analysis in practice
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# run nltk.download('vader_lexicon') once if the VADER lexicon is missing
sia = SentimentIntensityAnalyzer()
sentences = ['Hello, world. I am terrible']
for sentence in sentences:
    print(sentence)
    point = sia.polarity_scores(sentence)
    print(point)
    # print the four scores in a fixed key order
    for k in sorted(point):
        print('{0}: {1}, '.format(k, point[k]), end='')
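polarity_scores returns four fields: neg, neu, pos, and a normalized compound score in [-1, 1]. A common convention from the VADER documentation (the 0.05 thresholds are adjustable) maps compound to a coarse label:

def label(compound):
    # map the compound score to a sentiment label
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

print(label(sia.polarity_scores('I am terrible')['compound']))  # 'terrible' pushes the score negative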
4. Similarity analysis (LDA, LSI, TF-IDF)
Complete code
The preprocessing helpers from Part 1 are repeated here so this section stays self-contained:
from nltk.corpus import stopwords as pw

cacheStopWords = pw.words("english")

def English_processing(sentence):
    if sentence:
        # lowercase, strip punctuation and digits, drop stop words (see Part 1)
        sentence = sentence.lower()
        for ch in "“”!?.\\;'',()<>{}/-1234567890$&#%~":
            sentence = sentence.replace(ch, " ")
        sentence = ' '.join(word for word in sentence.split() if word not in cacheStopWords)
        sentence = ' '.join(word for word in sentence.split() if word not in ['br', 'w', 'b', 'bc'])
        return sentence

def concat_sentence(sen_list):
    # join a list of sentences into one long document
    all_sen = ""
    for i in sen_list:
        all_sen += ' '
        all_sen += str(i)
    return all_sen
import numpy as np
from gensim import corpora, models, similarities
from collections import defaultdict
import time
class SentenceSimilarity():
    def __init__(self, sentences, min_frequency=1):
        # preprocess every sentence up front
        self.sentences = []
        for i in range(0, len(sentences)):
            self.sentences.append(English_processing(sentences[i]))
        self.sentences_num = len(self.sentences)

        # tokens must occur more often than this to be kept
        self.min_frequency = min_frequency

    def get_cuted_sentences(self):
        # tokenize each preprocessed sentence
        cuted_sentences = []
        for sentence in self.sentences:
            cuted_sentences.append(sentence.strip().split())
        return cuted_sentences
    
    def simple_model(self):
        # build the shared bag-of-words corpus
        self.texts = self.get_cuted_sentences()

        # drop rare tokens
        frequency = defaultdict(int)
        for text in self.texts:
            for token in text:
                frequency[token] += 1
        self.texts = [[token for token in text if frequency[token] > self.min_frequency] for text in self.texts]
        self.dictionary = corpora.Dictionary(self.texts)

        self.corpus_simple = [self.dictionary.doc2bow(text) for text in self.texts]
    
    def TfidfModel(self):
        # TF-IDF weighting on top of the bag-of-words corpus
        self.simple_model()

        self.model = models.TfidfModel(self.corpus_simple)
        self.corpus = self.model[self.corpus_simple]

        self.index = similarities.MatrixSimilarity(self.corpus)

    def LsiModel(self):
        # latent semantic indexing (SVD over the term-document matrix)
        self.simple_model()

        self.model = models.LsiModel(self.corpus_simple, id2word=self.dictionary)
        self.corpus = self.model[self.corpus_simple]

        self.index = similarities.MatrixSimilarity(self.corpus)

    def LdaModel(self):
        # latent Dirichlet allocation topic model
        self.simple_model()

        self.model = models.LdaModel(self.corpus_simple, id2word=self.dictionary)
        self.corpus = self.model[self.corpus_simple]

        self.index = similarities.MatrixSimilarity(self.corpus)
    
    def sentence2vec(self, sentence):
        # preprocess a query sentence and project it into the model space
        sentence = English_processing(sentence)
        vec_bow = self.dictionary.doc2bow(sentence.strip().split())
        return self.model[vec_bow]

    def bow2vec(self):
        # expand each sparse bag-of-words vector into a dense numpy array
        vec = []
        length = max(self.dictionary) + 1
        for content in self.corpus:
            sentence_vectors = np.zeros(length)
            for co in content:
                sentence_vectors[co[0]] = co[1]
            vec.append(sentence_vectors)
        return vec
    
    
    def similarity(self, sentence):
        # return the index and score of the single best match
        sentence_vec = self.sentence2vec(sentence)
        sims = self.index[sentence_vec]
        sim = max(enumerate(sims), key=lambda item: item[1])
        index = sim[0]
        score = sim[1]
        return index, score

    def similarity_k(self, sentence, k):
        # return the indices and scores of the k best matches
        sentence_vec = self.sentence2vec(sentence)
        t1 = time.time()
        sims = self.index[sentence_vec]
        t2 = time.time()
        print('Retrieval took {:.4f}s over {} samples'.format(t2 - t1, self.sentences_num))
        sim_k = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)[:k]
        indexs = [i[0] for i in sim_k]
        scores = [i[1] for i in sim_k]
        return indexs, scores
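A usage sketch (the corpus and query here are made up; TfidfModel can be swapped for LsiModel or LdaModel):

docs = ["The movie was wonderful and moving",
        "An awful, boring film",
        "A touching and wonderful story"]
ss = SentenceSimilarity(docs, min_frequency=0)  # keep every token on this tiny corpus
ss.TfidfModel()                                 # or ss.LsiModel() / ss.LdaModel()
indexs, scores = ss.similarity_k("a wonderful movie", k=2)
for i, s in zip(indexs, scores):
    print(docs[i], s)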