Dataset: the 20 Newsgroups corpus (fetched below via sklearn.datasets.fetch_20newsgroups).
LDA is used to classify the text in a document under a particular topic. It builds a topic-per-document model and a words-per-topic model, both modeled as Dirichlet distributions.
Each document is modeled as a multinomial distribution over topics, and each topic is modeled as a multinomial distribution over words. LDA assumes that every chunk of text we feed in contains words that are somehow related, so choosing the right corpus of data is crucial. It also assumes documents are produced from a mixture of topics; those topics then generate words according to their probability distributions.
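To make this generative story concrete, the short sketch below samples a toy document the way LDA assumes documents are produced. It is illustrative only and not part of the pipeline that follows; the vocabulary, the number of topics and the Dirichlet parameters are arbitrary choices for the demonstration.

import numpy as np

rng = np.random.default_rng(0)

vocab = ["disk", "drive", "fail", "game", "team", "score"]  # toy vocabulary
num_topics = 2

# Each topic is a distribution over the vocabulary (one Dirichlet draw per topic)
topic_word = rng.dirichlet(alpha=[0.5] * len(vocab), size=num_topics)

# Each document is a mixture of topics (one draw from a Dirichlet over topics)
doc_topic = rng.dirichlet(alpha=[0.1] * num_topics)

# Generate a short document: pick a topic for each word, then a word from that topic
words = []
for _ in range(8):
    z = rng.choice(num_topics, p=doc_topic)      # topic assignment for this word
    w = rng.choice(len(vocab), p=topic_word[z])  # word drawn from that topic
    words.append(vocab[w])

print("topic mixture:", np.round(doc_topic, 2))
print("generated document:", " ".join(words))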
code:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', shuffle = True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle = True)
print(list(newsgroups_train.target_names))
# Let's look at some sample news posts
newsgroups_train.data[:2]
print(newsgroups_train.filenames.shape, newsgroups_train.target.shape)
'''
Loading Gensim and nltk libraries
'''
# pip install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk
nltk.download('wordnet')
print(WordNetLemmatizer().lemmatize('went', pos='v'))  # 'went' -> 'go': maps a past-tense verb to its base form
import pandas as pd
stemmer = SnowballStemmer("english")
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned',
'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational',
'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data={'original word':original_words, 'stemmed':singles })
'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    # Lemmatize (as a verb) first, then stem the result
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        # Drop stopwords and very short tokens before lemmatizing/stemming
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result
'''
Preview a document after preprocessing
'''
doc_sample = 'This disk has failed many times. I would like to get it replaced.'
print("Original document: ")
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))
processed_docs = []
for doc in newsgroups_train.data:
    processed_docs.append(preprocess(doc))
'''
Preview 'processed_docs'
'''
print(processed_docs[:2])
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)
'''
Checking dictionary created
'''
count = 0
for k, v in dictionary.items():
    print(k, v)
    count += 1
    if count > 10:
        break
'''
OPTIONAL STEP
Remove very rare and very common words:
- words appearing in fewer than 15 documents
- words appearing in more than 10% of all documents
After filtering, keep only the 100000 most frequent remaining tokens.
'''
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n=100000)
'''
Create the Bag-of-Words representation for each document, i.e. for each document a list of
(word_id, word_count) pairs reporting which words occur and how many times. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
'''
Preview the BOW representation for a sample document from the corpus
'''
document_num = 20
bow_doc_x = bow_corpus[document_num]
for word_id, word_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time(s).".format(word_id,
                                                        dictionary[word_id],
                                                        word_count))
# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus,
# num_topics = 10,
# id2word = dictionary,
# passes = 50)
# LDA multicore
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=8,
                                       id2word=dictionary,
                                       passes=10,
                                       workers=2)
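'''
OPTIONAL: persist the trained model so it can be reloaded later without retraining.
Gensim models provide save()/load(); the file name 'lda.model' below is just an example.
'''
# lda_model.save('lda.model')
# lda_model = gensim.models.LdaMulticore.load('lda.model')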
'''
For each topic, explore the words occurring in that topic and their relative weights
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic))
    print("\n")
num = 100
unseen_document = newsgroups_test.data[num]
print(unseen_document)
# Data preprocessing step for the unseen document
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1 * tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))
# The true newsgroup label of this test document, for comparison with the topics above
print(newsgroups_test.target[num])
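As an aside, the numeric label can be mapped back to its newsgroup name via the dataset's target_names list:

print(newsgroups_test.target_names[newsgroups_test.target[num]])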