
Movie Review Sentiment Analysis on the IMDB Dataset | Naive Bayes and Neural Network Models in Python

Date: 2021-04-07 16:34:16


Import packages

import torch  # torch==1.7.1
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import os
import re
import numpy as np
from tqdm import tqdm

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_WORD = 10000  # keep only the 10,000 most frequent words
MAX_LEN = 300     # pad or truncate every sentence to 300 tokens
word_count = {}   # dictionary mapping word -> occurrence count

Data processing

# Read the dataset

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv('./data/labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
print('dataset shape is', data.shape)

# Data cleaning

# Strip HTML markup
from bs4 import BeautifulSoup
example = BeautifulSoup(data['review'][0])
print(example.get_text())

# Remove non-letter characters
import re
letters_only = re.sub('[^A-Za-z]', ' ', example.get_text())
print(letters_only)

# Lowercase the text and split it into words
lower_case = letters_only.lower()
words = lower_case.split()
print(words)

# Load stop words

# import nltk
# nltk.download('stopwords')

def get_custom_stopwords(stop_words_file):
    with open(stop_words_file, encoding='utf-8') as f:
        stopwords = f.read()
    stopwords_list = stopwords.split('\n')
    custom_stopwords_list = [i for i in stopwords_list]
    return custom_stopwords_list

stop_words_file = 'english.txt'
stopwords = get_custom_stopwords(stop_words_file)
words = [word for word in words if word not in stopwords]
' '.join(words)
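As an alternative to the custom english.txt file, the stop-word list bundled with NLTK could be used; this is only a sketch and assumes the corpus has already been fetched once with nltk.download('stopwords').

# Alternative: NLTK's built-in English stop-word list instead of english.txt
from nltk.corpus import stopwords as nltk_stopwords

nltk_stop_set = set(nltk_stopwords.words('english'))
words = [word for word in words if word not in nltk_stop_set]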

# Wrap the cleaning steps into a function

from bs4 import BeautifulSoup
# import re
# from nltk.corpus import stopwords

# review_to_text performs the three preprocessing steps on one raw review
def review_to_text(review):
    # Step 1: strip HTML markup
    raw_text = BeautifulSoup(review, 'html').get_text()
    # Step 2: replace non-letter characters with spaces: re.sub(pattern, replacement, string)
    letters = re.sub('[^a-zA-Z]', ' ', raw_text)
    # Step 3: lowercase the text and split it on whitespace into a list of words
    words = letters.lower().split()
    return words

# Apply the three preprocessing steps above to every review

X_data = []
y_data = []
for review in data['review']:
    X_data.append(' '.join(review_to_text(review)))
for sentiment in data['sentiment']:
    y_data.append(sentiment)
# y_data = data['sentiment']
# print(X_data, y_data)

# Split the dataset into training and test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=45)

A traditional machine-learning model: Naive Bayes

# Vectorize the reviews and train a Naive Bayes classifier

from sklearn.feature_extraction.text import CountVectorizer

# max_features=5000 keeps the 5,000 most frequent words as the vector dimensions
vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                             stop_words=None, max_features=5000)
train_data_features = vectorizer.fit_transform(X_train)
t_data_features = vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_data_features, y_train)
print(nb.score(train_data_features, y_train))
print(nb.score(t_data_features, y_test))

# Prediction on a new review
# pre_str = "Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty"
# pre_str_list = [' '.join(review_to_text(pre_str))]
# pre_data = vectorizer.transform(pd.Series(pre_str_list))
# result = nb.predict(pre_data)
# print(result)

Output:

0.86145
0.8498
[1]

The training-set accuracy is about 0.86 and the test-set accuracy about 0.85; the final [1] is the predicted label (positive) for the sample review in the commented-out prediction code.
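The prediction code above is left commented out; below is a minimal runnable sketch of scoring a fresh review with the fitted pipeline. It assumes vectorizer, nb, and review_to_text from the cells above are in scope, and the sample sentence is made up for illustration.

import pandas as pd

new_review = "A quiet, beautifully acted film that rewards patience."

# Reuse the same cleaning and vectorization steps as the training data
cleaned = ' '.join(review_to_text(new_review))
features = vectorizer.transform(pd.Series([cleaned]))

print(nb.predict(features))         # predicted label: 1 = positive, 0 = negative
print(nb.predict_proba(features))   # class probabilities, columns ordered by nb.classes_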

Training, classification, and prediction with an LSTM/GRU neural model

# Write the cleaned training and test data to new train.txt and test.txt files so they can be loaded through a Dataset

with open("train.txt","w",encoding="utf-8") as f:for i in range(len(X_train)):# print(type(y_train[i]))# print(y_train[i])# print(type(X_train[i]))# print(type(X_train[i]))f.write(str(y_train[i])+" "+X_train[i]+"\n")f.close()with open("test.txt","w",encoding="utf-8") as f1:for i in range(len(X_test)):# print(type(y_train[i]))# print(y_train[i])# print(type(X_train[i]))# print(type(X_train[i]))f1.write(str(y_test[i])+" "+X_test[i]+"\n")f1.close()

# Tokenize the English sentences, count word frequencies, and build the vocabulary

def tokenizer(sentence):
    return sentence.split()

def data_process(text):
    for line in text:
        tokens = tokenizer(line)  # tokenize and count word occurrences
        for token in tokens:
            if token in word_count.keys():
                word_count[token] = word_count[token] + 1
            else:
                word_count[token] = 1
    print("build vocabulary")
    vocab = {"<UNK>": 0, "<PAD>": 1}
    # Sort words by frequency and keep only the MAX_WORD most frequent ones
    word_count_sort = sorted(word_count.items(), key=lambda item: item[1], reverse=True)
    word_number = 1
    for word in word_count_sort:
        if word[0] not in vocab.keys():
            vocab[word[0]] = len(vocab)
            word_number += 1
        if word_number > MAX_WORD:
            break
    return vocab

# Build the vocabulary

vocab = data_process(X_train)
# print(vocab)
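A small sanity check (the expected numbers are an assumption based on the code above, not output reported in the post): the vocabulary should hold the two special tokens plus at most MAX_WORD words, each mapped to an integer id.

print(len(vocab))                       # at most MAX_WORD + 2, counting <UNK> and <PAD>
print(vocab['<UNK>'], vocab['<PAD>'])   # 0 1
print(list(vocab.items())[:5])          # a few (word, id) pairs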

# Build the recurrent model. Although the class is named GRU, the encoder below uses nn.LSTM; to switch between the two cells, just swap nn.LSTM for nn.GRU (a sketch of the GRU variant follows the class).

class GRU(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(GRU, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)  # embedding layer
        self.encoder = nn.LSTM(input_size=embed_size, hidden_size=num_hiddens,
                               num_layers=num_layers, bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        # inputs has shape (batch_size, seq_len); the LSTM expects seq_len as the
        # first dimension, so transpose before looking up the embeddings
        embeddings = self.embedding(inputs.permute(1, 0))  # permute(1, 0) swaps the dimensions
        # Only embeddings are passed in, so the LSTM returns the last layer's hidden
        # states at every time step; outputs has shape (seq_len, batch_size, num_hiddens)
        outputs, _ = self.encoder(embeddings)
        # Use the hidden state of the final time step as the input to the fully
        # connected layer; its shape is (batch_size, num_hiddens)
        encoding = outputs[-1]
        outs = self.softmax(self.decoder(encoding))  # two-class probabilities [a, b]
        return outs
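Because the section promises a GRU but the encoder above is actually an LSTM, here is a minimal sketch (not from the original post) of the same classifier with a genuine nn.GRU encoder. The constructor arguments mirror the class above; only the recurrent cell changes, and a GRU returns no cell state.

class GRUClassifier(nn.Module):
    """Same architecture as the class above, but with an nn.GRU encoder."""
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # nn.GRU accepts the same constructor arguments used for nn.LSTM above
        self.encoder = nn.GRU(input_size=embed_size, hidden_size=num_hiddens,
                              num_layers=num_layers, bidirectional=False)
        self.decoder = nn.Linear(num_hiddens, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, inputs):
        embeddings = self.embedding(inputs.permute(1, 0))
        outputs, _ = self.encoder(embeddings)  # GRU returns (output, h_n); no cell state
        return self.softmax(self.decoder(outputs[-1]))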

# Convert text to index tensors

def text_transform(sentence_list, vocab):
    sentence_index_list = []
    for sentence in sentence_list:
        # Tokenize the sentence and map each token to its id, or to <UNK> if unknown
        sentence_idx = [vocab[token] if token in vocab.keys() else vocab['<UNK>'] for token in tokenizer(sentence)]
        if len(sentence_idx) < MAX_LEN:
            for i in range(MAX_LEN - len(sentence_idx)):  # pad short sentences with <PAD>
                sentence_idx.append(vocab['<PAD>'])
        sentence_idx = sentence_idx[:MAX_LEN]  # truncate to the first MAX_LEN tokens
        sentence_index_list.append(sentence_idx)
    return torch.LongTensor(sentence_index_list)  # stack the id lists into a tensor
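A quick usage sketch (the two sentences are invented): text_transform takes a list of cleaned sentences and returns a (batch_size, MAX_LEN) LongTensor, which is the shape the model's forward pass expects.

sample_batch = ["this movie was wonderful", "boring plot and weak acting"]
batch_tensor = text_transform(sample_batch, vocab)

print(batch_tensor.shape)    # torch.Size([2, 300]) with MAX_LEN = 300
print(batch_tensor[0][:10])  # first word ids of sentence one, then <PAD> ids (1)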

# Model training

def train(model, train_data, vocab, epoch=10):
    print('train model')
    model = model.to(device)
    loss_sigma = 0.0
    correct = 0.0
    # Loss function and optimizer
    criterion = torch.nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
    for epoch in tqdm(range(epoch)):
        model.train()
        avg_loss = 0  # average loss
        avg_acc = 0   # average accuracy
        for idx, (text, label) in enumerate(tqdm(train_data)):
            train_x = text_transform(text, vocab).to(device)
            train_y = label.to(device)
            optimizer.zero_grad()
            pred = model(train_x)
            loss = criterion(pred.log(), train_y)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            avg_acc += accuracy(pred, train_y)
        # At the end of each epoch, report the average loss and average accuracy
        avg_loss = avg_loss / len(train_data)
        avg_acc = avg_acc / len(train_data)
        print("avg_loss:", avg_loss, " train_avg_acc:", avg_acc)
    # Save the trained model parameters
    torch.save(model.state_dict(), 'LSTM_IMDB_parameter.pkl')

# Define the dataset format

class MyDataset(Dataset):
    def __init__(self, text_path):
        file = open(text_path, 'r', encoding='utf-8')
        self.text_with_tag = file.readlines()  # lines holding both label and review text
        file.close()

    def __getitem__(self, index):
        line = self.text_with_tag[index]  # one sample: label plus review text
        label = int(line[0])              # the label is the first character
        text = line[2:-1]                 # the rest of the line, minus the trailing newline
        return text, label

    def __len__(self):
        return len(self.text_with_tag)

# Model evaluation

def tst(model, test_data, vocab):
    print('test model')
    model = model.to(device)
    model.eval()
    avg_acc = 0
    for idx, (text, label) in enumerate(tqdm(test_data)):
        train_x = text_transform(text, vocab).to(device)
        train_y = label.to(device)
        pred = model(train_x)
        avg_acc += accuracy(pred, train_y)
    avg_acc = avg_acc / len(test_data)
    return avg_acc

# Compute prediction accuracy

def accuracy(y_pred, y_true):
    label_pred = y_pred.max(dim=1)[1]
    # For binary 0/1 labels, |label_pred - y_true| is 1 for every wrong prediction,
    # so subtracting the sum from the batch size gives the number of correct predictions
    acc = len(y_pred) - torch.sum(torch.abs(label_pred - y_true))
    return acc.detach().cpu().numpy() / len(y_pred)
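An equivalent formulation (a sketch, not part of the original code) counts exact matches directly; it gives the same result for the 0/1 labels used here and also works for more than two classes.

def accuracy_eq(y_pred, y_true):
    # Count predictions whose argmax matches the true label
    label_pred = y_pred.max(dim=1)[1]
    correct = (label_pred == y_true).sum()
    return correct.detach().cpu().numpy() / len(y_pred)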

# main function

def main():
    vocab = data_process(X_train)
    np.save('vocab.npy', vocab)  # save the vocabulary to disk
    vocab = np.load('vocab.npy', allow_pickle=True).item()  # reload the stored vocabulary

    # Build MyDataset instances
    train_data = MyDataset(text_path="./train.txt")
    test_data = MyDataset(text_path="./test.txt")

    # Build the DataLoaders
    train_loader = DataLoader(dataset=train_data, batch_size=128, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=False)

    # Build and train the model
    model = GRU(vocab=vocab, embed_size=300, num_hiddens=256, num_layers=3)
    train(model=model, train_data=train_loader, vocab=vocab, epoch=30)

    # Load the trained parameters
    model.load_state_dict(torch.load('LSTM_IMDB_parameter.pkl',
                          map_location=torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')))

    # Evaluate on the test set
    acc = tst(model=model, test_data=test_loader, vocab=vocab)
    print(acc)

# Entry point

if __name__ == '__main__':
    main()

Results

References:

Sentiment analysis on the IMDB dataset

Building an LSTM classifier in PyTorch for IMDB sentiment classification

Dataset:

The dataset and stop-word list used in this post

Link: /s/1OTgLDoE1P9_FPDQaLU1VKw

Extraction code: mz6p

