GuidedLDA
GuidedLDA copied to clipboard
Seeded topic words are not getting importance
Hi @vi3k6i5 ,
I'm trying GuidedLDA on data from six reviews, with a seed confidence of 0.15, but the seeded words are not moving up the topic word lists as expected.
code below:
# Wrap the raw review texts in a one-column DataFrame.
df = pd.DataFrame(corpus, columns=['Review'])

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import string
from unidecode import unidecode
import unicodedata

nlp = spacy.load("en_core_web_sm")

punctuations = string.punctuation
# spaCy's stop-word list, but deliberately re-admitting 'not' and 'on'.
stopwords = set(STOP_WORDS) - {'not', 'on'}
parser = English()


def spacy_tokenizer(sentence):
    """Lemmatize, lowercase and stop-word-filter one review.

    Returns the surviving tokens re-joined into a single space-separated
    string (downstream code splits it on spaces again).
    """
    tokens = parser(sentence)
    # BUGFIX: the original condition used `or`
    # (`lemma != "-PRON-" or lemma != "-X-"`), which is always True, so the
    # `word.lower_` fallback for pronoun/unknown lemmas was dead code.
    lemmas = [
        tok.lemma_.lower().strip()
        if tok.lemma_ not in ("-PRON-", "-X-")
        else tok.lower_
        for tok in tokens
    ]
    # NOTE: `w not in punctuations` tests membership in a *string*, so only
    # single-character punctuation tokens are dropped (original behavior).
    kept = [w for w in lemmas if w not in stopwords and w not in punctuations]
    return " ".join(kept)
from tqdm import tqdm

tqdm.pandas()

# Clean each review with a progress bar, then re-tokenize on spaces.
df["cleaned_review"] = df['Review'].progress_apply(spacy_tokenizer)
all_review_list = [cleaned.split(' ') for cleaned in df['cleaned_review']]
# NOTE(review): `from nltk.corpus import stopwords` SHADOWS the spaCy-derived
# `stopwords` set built above.  spacy_tokenizer() has already been applied by
# this point in the script, but any later call to it would silently read the
# nltk module object instead of the set.  PorterStemmer and WordPunctTokenizer
# appear unused in the visible code.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
def get_bigrams(tokens):
    """Return *tokens* plus the top chi-squared collocation bigrams.

    Parameters
    ----------
    tokens : list[str]
        Unigram tokens of one document.

    Returns
    -------
    list[str]
        A NEW list: the original tokens followed by up to 500 bigrams,
        each joined with a single space (e.g. ``"late pickup"``).

    BUGFIX: the original appended into the caller's list in place and
    returned it; working on a copy removes the hidden mutation while
    producing the same returned content.
    """
    finder = BigramCollocationFinder.from_words(tokens)
    top_bigrams = finder.nbest(BigramAssocMeasures.chi_sq, 500)
    # NOTE(review): these bigrams are SPACE-joined, while gensim's Phraser
    # (see bigrams()/get_corpus()) joins with '_'.  Seed words must match
    # whichever form actually lands in the vocabulary.
    result = list(tokens)
    result.extend(' '.join(pair) for pair in top_bigrams)
    return result
import gensim
def bigrams(words, bi_min=10, tri_min=10): bigram = gensim.models.Phrases(words, min_count = bi_min) bigram_mod = gensim.models.phrases.Phraser(bigram) return bigram_mod
def get_corpus(words):
    """Build (bow corpus, dictionary, filtered token lists) from tokenized docs.

    Parameters
    ----------
    words : list[list[str]]
        One token list per document.

    Returns
    -------
    corpus : list of bag-of-words vectors (built from the UNFILTERED lists).
    id2word : gensim.corpora.Dictionary (also built from the unfiltered lists).
    filtered_bigram : list[list[str]]
        Token lists with 'not'/'only'/'on' and pure-digit tokens removed.
    """
    bigram_mod = bigrams(words)
    phrased = [bigram_mod[review] for review in words]
    final_bigram = []
    for gram in phrased:
        # Best effort: keep the plain tokens when collocation search fails
        # (e.g. a document too short for BigramCollocationFinder).
        # BUGFIX: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit.
        try:
            final_bigram.append(get_bigrams(gram))
        except Exception:
            final_bigram.append(gram)
    filtered_bigram = [
        [tok for tok in doc if tok not in ['not', 'only', 'on'] and not tok.isdigit()]
        for doc in final_bigram
    ]
    # NOTE(review): the dictionary and corpus come from the UNFILTERED lists,
    # but the FILTERED lists are what get vectorized downstream
    # (get_term_matrix).  Consider building all three from the same lists.
    id2word = gensim.corpora.Dictionary(final_bigram)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in final_bigram]
    return corpus, id2word, filtered_bigram
train_corpus, train_id2word, bigram_train = get_corpus(all_review_list)

# Index-ordered vocabulary: vocab[i] is the token with dictionary id i.
vocab = [train_id2word[idx] for idx in range(len(train_id2word))]
import numpy as np from gensim import matutils from gensim.matutils import corpus2csc
def bow_iterator(docs, dictionary):
    """Lazily yield ``dictionary.doc2bow(doc)`` for each document in *docs*."""
    yield from map(dictionary.doc2bow, docs)
def get_term_matrix(msgs, dictionary):
    """Return the documents as a transposed int64 sparse term matrix."""
    bow_stream = bow_iterator(msgs, dictionary)
    # np.transpose(m) on a sparse matrix dispatches to m.transpose(),
    # so chaining .T is equivalent.
    return matutils.corpus2csc(bow_stream).astype(np.int64).T
X = get_term_matrix(bigram_train, train_id2word)

import guidedlda

# Unguided baseline: plain two-topic LDA, no seeding.
model = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=42, refresh=20)
model.fit(X)

topic_word = model.topic_word_
n_top_words = 20
vocab_arr = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # Top-20 tokens by descending topic probability.
    topic_words = vocab_arr[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ','.join(topic_words)))
# Seed words as they should appear in the vocabulary.  NOTE(review): gensim's
# Phraser joins bigrams with '_' while get_bigrams() joins with ' '; these
# strings must match the form that actually landed in `vocab`, or seeding is
# silently ineffective.
seed_topic_list = [['late pickup','point nearly','arrive 1hour','30min destination'],['accord time','hour not','time schedule'] ]

model = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=7, refresh=20)

# token -> dictionary id
word2id = dict((v, idx) for idx, v in enumerate(vocab))

# Map each seed word's vocabulary id to its topic id.
# BUGFIX: the original indexed word2id[word] directly, which raises KeyError
# on any seed word missing from the vocabulary; report such words instead so
# ineffective seeding is debuggable.
seed_topics = {}
for t_id, topic_seeds in enumerate(seed_topic_list):
    for word in topic_seeds:
        if word in word2id:
            seed_topics[word2id[word]] = t_id
        else:
            print('seed word not in vocabulary, skipped: {}'.format(word))
# Guided fit: nudge the seeded words toward their topics.
model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)

topic_word = model.topic_word_
n_top_words = 20
for i, topic_dist in enumerate(topic_word):
    order = np.argsort(topic_dist)
    topic_words = np.array(vocab)[order][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ','.join(topic_words)))