__author__ = "Amoli Rajgor"
__email__ = "amoli.rajgor@gmail.com"
__website__ = "amolir.github.io"
There is a dedicated notebook for the data preparation stage containing the implementation of all the intermediate steps. At the end of each stage the processed data is stored as a CSV file. The current notebook focuses on stage 2 and stage 3.
I will be using the following list of packages for the project.
ℹ️ Dependencies
➤ numpy ≥ 1.22.3
➤ pandas ≥ 1.4.1
➤ scipy ≥ 1.8.0
➤ gensim ≥ 4.1.2
➤ spacy ≥ 3.3.0
➤ spacy-model-en_core_web_md ≥ 3.3.0
➤ nltk ≥ 3.5
➤ scikit-learn ≥ 1.0.2
➤ gsdmm ≥ 0.1
➤ pyldavis ≥ 3.3.1
➤ networkx
➤ corextopic ≥ 1.1
➤ pickle
➤ html
➤ re
➤ string
➤ collections
Run pip install -r requirements.txt from the terminal to install all the dependencies before running the notebook. Create a folder named data and place the downloaded .json file inside it; the intermediate CSV files (restaurant_review.csv and aspect.csv) will be stored in the data folder itself. The cleaned reviews dataframe is stored as a pickle object cleaned.pickle. Similarly, stopwords learned during the training are stored as a pickle object stopwords.pickle. Custom stopword and topic-word lists are defined in stopwordslist.py. Run the notebooks in the order data_preparation.ipynb -> aspect_extraction.ipynb to generate results.
# Data Manipulation
import pandas as pd
import numpy as np
# html parsing
import html
# RegEx and String Manipulation
import re
import string
# For Sentence Tokenization
from spacy.lang.en import English
# For Text Cleaning
import spacy
# Storing Objects
import pickle
# For finding frequent words
from collections import Counter
from nltk import FreqDist
# For Bigrams
from gensim.models import Phrases
from gensim.models.phrases import Phraser
# For LDA
from gensim import corpora
from gensim import models
# For GSDMM
from gsdmm import MovieGroupProcess
from gensim.models.coherencemodel import CoherenceModel
# Visualization
import matplotlib.pyplot as plt
# Visualize LDA
import pyLDAvis
import pyLDAvis.gensim_models
# For generating sparse data matrix
import scipy.sparse as ss
# For CorEx topic modelling
import corextopic.corextopic as ct
import corextopic.vis_topic as vt
# To generate bow
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
# warnings.filterwarnings("ignore", category=DeprecationWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)
# Import custom stopwords and topic words list
from stopwordslist import *
reviews = pd.read_csv("data/restaurant_review.csv")
print(f'\n\033[1mShape of the data: \033[0m {reviews.shape}', end="\n\n")
display(reviews.head())
From a quick inspection of the review texts, we see the presence of HTML entities, incorrectly placed terminating punctuation, misplaced blank spaces, etc. So before extracting features from the data, let's first clean it and tokenize the texts into sentences. The steps involved in the text preparation stage are as follows (a minimal illustration follows the list):
1. Parse HTML entities in the text (e.g. convert &quot; to ").
2. Add a trailing space after terminating punctuation (!, ?, and .) so that sentences split correctly.
3. Tokenize each review into individual sentences.
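As a quick illustration, here is a minimal sketch of the first two steps on a made-up snippet; the simplified regex here is not the exact pattern used below, which also protects decimals and abbreviations.
import html
import re
raw = "Great &quot;tasting menu&quot; &amp; friendly staff.Service was a bit slow!Still worth it."
decoded = html.unescape(raw)                        # &quot; -> " and &amp; -> &
spaced = re.sub(r'([.!?])(?=\S)', r'\1 ', decoded)  # simplified: add a space after ., ! and ?
print(spaced)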
# html entity in the text
print("\n\033[1m{}\033[0m".format("Text before parsing the HTML entities:"))
print(reviews.review_text[11],end ="\n")
reviews["text"] = reviews["review_text"].apply(html.unescape)
# Entity removed
print("\n\033[1m{}\033[0m".format("Text after parsing the HTML entities:"))
print(reviews.text[11])
# Incorrect punctuation, sentence will not split
print("\n\033[1m{}\033[0m".format("Text with incorrect terminating punctuation in the first line:"))
print(reviews.text[52][-216:], end ="\n")
# Add space after punctuations
punct_trail_space_pattern = re.compile(r'(\d+\.\d+|\b[A-Z](?:\.[A-Z])*\b\.?)|([.,;:!?])\s*')
def add_trail_space(text):
return re.sub(punct_trail_space_pattern, lambda x: x.group(1) or f'{x.group(2)} ', text)
# Add a trailing space after terminating punctuation in every review
reviews["text"] = reviews["text"].apply(add_trail_space)
print("\n\033[1m{}\033[0m".format("Text with corrected punctuation:"))
print(reviews.text[52][-218:])
# Text with multiple sentences
print("\n\033[1m{}\033[0m".format("Review containing multiple sentences:"))
print(reviews.review_text[1], end="\n")
# Empty pipeline with only language and no model
nlp = English()
nlp.add_pipe("sentencizer")
def sentence_tokenizer(doc):
return [sent.text for sent in doc.sents]
def tokenize_pipe(texts):
preproc_pipe = []
for doc in nlp.pipe(texts, batch_size=20):
preproc_pipe.append(sentence_tokenizer(doc))
return preproc_pipe
reviews['text'] = tokenize_pipe(reviews['text'])
print("\n\033[1m{}\033[0m".format("Review split into separate sentences:"))
display(reviews.text[1])
# Flatten the list so that each sentence gets its own row
reviews = reviews.explode("text", ignore_index=True)
reviews.head()
reviews.shape
Next, split sentences that contain multiple clauses separated by a semicolon (;) using a simple regex.
# Clause separation
print("\n\033[1m{}\033[0m".format("Review with clauses:"))
print(reviews.text[410])
# print(reviews.text[2], end="\n\n")
# Split sentences having multiple clauses separated by ";"
def split_sentence_clauses(text):
return re.split(r"(?<!\w\;\w.)(?<![A-Z][a-z]\;)(?<=\;)\s", text)
# return re.split(r"(?<!\w\;\w.)(?<![A-Z][a-z]\;)(?<=\;)\s|(?<!\w\,\w.)(?<![A-Z][a-z]\,)(?<=\,)\s", text)
reviews["text"] = reviews["text"].apply(split_sentence_clauses)
print("\n\033[1m{}\033[0m".format("Clauses split as documents:"))
print(reviews.text[410])
# print(reviews.text[2])
# Flatten the list so that each clause gets its own row
reviews = reviews.explode("text", ignore_index=True)
# Clauses are split
display(reviews.iloc[413:415])
# display(reviews.iloc[np.r_[2:5, 569:571]])
reviews.shape
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
nlp.add_pipe('sentencizer')
stopwords = set(nlp.Defaults.stop_words)
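# Note: at this stage we keep lowercased surface forms filtered by POS tag, length and
# stopword membership; lemmatization proper is applied later on the cleaned tokens.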
def tokenize_lda_pipe(doc, stopwords, pos_list):
lemma_list = [token.text.lower() for token in doc if token.is_alpha and
token.pos_ in pos_list and len(token)>2 and
token.text.lower() not in stopwords]
return " ".join(lemma_list)
def preprocess_lda_pipe(texts, stopwords, pos_list = ["NOUN", "ADJ", "VERB"]):
processed_pipe = []
for doc in nlp.pipe(texts, batch_size=1000):
processed_pipe.append(tokenize_lda_pipe(doc, stopwords, pos_list=pos_list))
return processed_pipe
%time reviews['lda_tokens'] = preprocess_lda_pipe(reviews['text'], stopwords, ["NOUN", "ADJ", "VERB", "ADV"])
%time reviews['lda_tokens_noun'] = preprocess_lda_pipe(reviews['text'], stopwords, ["NOUN"])
# Remove empty lists
reviews = reviews[reviews['lda_tokens'].map(lambda d: len(d)) > 0].copy()
reviews = reviews.reset_index().rename(columns={"index":'review_id', "id":"restaurant_id"})
reviews.head(8)
# Find all the tokens
review_tokens = reviews.lda_tokens.str.split(" ").tolist()
# Train the model to learn ngrams
bigram_model = Phrases(review_tokens, min_count=30, threshold=5)
bigram_phraser = Phraser(bigram_model)
trigram_model = Phrases(bigram_model[review_tokens], min_count=15, threshold=5)
trigram_phraser = Phraser(trigram_model)
def get_ngram(doc):
l = [token.text for token in doc]
l = trigram_phraser[bigram_phraser[l]]
return " ".join(l)
def ngram_lda_pipe(texts):
ngram_pipe = []
for doc in nlp.pipe(texts, batch_size=1000):
ngram_pipe.append(get_ngram(doc))
return ngram_pipe
%time reviews['lda_tokens'] = ngram_lda_pipe(reviews['lda_tokens'])
reviews.head()
# with open('cleaned.pickle', 'wb') as handle:
# pickle.dump(reviews, handle, protocol=pickle.HIGHEST_PROTOCOL)
Find the most frequently occurring words in the corpus and use this information to create a relevant set of stopwords. We will not remove all of the most frequent words, because many of them are keywords for determining aspects such as Food, Service, Staff, Ambience, etc. Later, the 1000 most prominent words are also used to form seed words for semi-supervised LDA.
# Frequency distribution of all the words
def get_most_frequent_words(all_words_list, word_limit = 1000, remove_stopwords = False):
corpus_list = [word for word_list in all_words_list for word in word_list]
if remove_stopwords:
stopwords = set(nlp.Defaults.stop_words)
corpus_list = [word for word in corpus_list if word.lower() not in stopwords]
f_dist = FreqDist(corpus_list)
if word_limit:
# Word limit set: return word_limit words in decreasing order of occurrence
top_words, _ = zip(*f_dist.most_common(word_limit))
else:
# Word limit not set: return all words in decreasing order of occurrence
top_words, _ = zip(*f_dist.most_common())
return top_words
# Get a list of most frequently used words
# print(get_most_frequent_words(reviews.loc[:, 'lda_tokens'].str.split(), 1000, True)[0:100])
frequent_words_list = get_most_frequent_words(reviews.loc[:, 'lda_tokens'].str.split(), None, False)
print(f'\n\033[1m Most Frequent Words: \033[0m {frequent_words_list[:100]}')
print(f'\n\033[1m Total number of words in the vocabulary:\033[0m {len(frequent_words_list)}')
Choose a threshold to remove the least-occurring words. Here we find words whose occurrences make up less than 0.001% of the total number of words in the corpus.
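As a rough sanity check on what this threshold means (the corpus size here is hypothetical; the actual count is computed from corpus_list below):
# Hypothetical: in a corpus of 400,000 tokens, 0.001% corresponds to 4 occurrences,
# so words appearing fewer than ~4 times would be added to the stopword list.
hypothetical_corpus_size = 400_000
min_occurrences = hypothetical_corpus_size * 0.001 / 100
print(min_occurrences)  # 4.0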
# Threshold: percentage occurrence of a word in the corpus
threshold = 0.001
all_word_list = reviews.lda_tokens.str.split(" ").to_list()
corpus_list = [word for word_list in all_word_list for word in word_list]
topic_occurence = Counter(corpus_list).most_common()
topic_occurence = [(topic, freq, round(float(freq) * 100 /len(corpus_list),4)) for topic, freq in topic_occurence]
less_than_point_001_percent_lemma = [topic for topic, freq, percent in topic_occurence if percent < threshold]
# less_than_5_freq_lemma = [topic for topic, freq, percent in topic_occurence if freq < 5]
with open('stopwords.pickle', 'wb') as handle:
pickle.dump(less_than_point_001_percent_lemma, handle, protocol=pickle.HIGHEST_PROTOCOL)
# pickle.dump(less_than_5_freq_lemma, handle, protocol=pickle.HIGHEST_PROTOCOL)
# topic_occurence
print("\n\033[1m{}\033[0m".format("List of words occurring less than 0.001% times:"))
print(less_than_point_001_percent_lemma[1:10])
print("\n\033[1m{}\033[0m".format("Top words occurring less than 0.001% times:"))
topic_occurence[-10::1]
It can be observed that the majority of words in the least-occurring list are misspelt. Spell checking is a processing-heavy task, so we are not explicitly correcting these words; doing so would improve the lemmatization results, but assuming there are not many misspelt words we simply remove them from the vocabulary. A spelling-correction function like the one shown below (demo) could be included as part of the data-cleaning pipeline.
from nltk.metrics.distance import edit_distance
from nltk.corpus import words
correctly_spelled_words = words.words()
misspelled = ['acomadating', 'attachd', 'distence']
for word in misspelled:
temp = [(edit_distance(word, w),w) for w in correctly_spelled_words if w[0]==word[0]]
print(sorted(temp, key = lambda val:val[0])[0][1])
One part of the training process involves updating the list of stopwords with words that are found to be irrelevant after running the model. These words appear among the top words of a topic but don't add much to its semantics. For example, during lemmatization we want to keep VERB tokens that add meaning to the reviews, such as eat, serve, dine, recommend, complain, clean, etc.; but doing so also retains tokens like catch, allow, save, suppose, speak, carry, move, etc., which are ambiguous and can represent multiple topics. Because of this we have to constantly refine the vocabulary to keep only tokens that are useful in creating distinct topics. We therefore start from the cleaned reviews data for every fresh run of the model and update the lemma_stopwords list based on the results.
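For instance, after inspecting a run, the list in stopwordslist.py could be extended along the lines below. The exact contents of that file are not shown in this notebook, so this is only an illustrative sketch (assuming lemma_stopwords is a set) using the ambiguous verbs mentioned above.
# Illustrative only: grow lemma_stopwords with topic-neutral verbs spotted in the latest run
lemma_stopwords = lemma_stopwords | {"catch", "allow", "save", "suppose", "speak", "carry", "move"}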
with open('cleaned.pickle', 'rb') as handle:
reviews = pickle.load(handle)
reviews.head()
with open('stopwords.pickle', 'rb') as handle:
less_than_point_001_percent_lemma = pickle.load(handle)
# less_than_5_freq_lemma = pickle.load(handle)
print(less_than_point_001_percent_lemma[0:10])
# print(less_than_5_freq_lemma[0:10])
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])
nlp.add_pipe('sentencizer')
stopwords = set(nlp.Defaults.stop_words)
custom_stopwords = custom_stopwords.union(lemma_stopwords).union(stopwords).union(set(less_than_point_001_percent_lemma))
print(f'\n\033[1mTotal number of stopwords to be removed: \033[0m {len(custom_stopwords)}')
# Refine Lemmatization
def lemmatization(doc, stopwords):
lemma_list = [str(token.lemma_) for token in doc if
len(token)>2 and
token.lemma_ not in stopwords and token.text not in stopwords]
return lemma_list
def lemmatize_lda_pipe(texts, stopwords):
processed_pipe = []
for doc in nlp.pipe(texts, batch_size=1000):
processed_pipe.append(lemmatization(doc, stopwords))
return processed_pipe
# Lemmatize
%time reviews['topic_words'] = lemmatize_lda_pipe(reviews["lda_tokens"], custom_stopwords)
# Lemmatize
%time reviews['lda_tokens_noun'] = lemmatize_lda_pipe(reviews["lda_tokens_noun"], stopwords)
reviews.shape
reviews.head(3)
LDA is an unsupervised algorithm used to extract topics from documents. In topic modelling we have two basic scenarios:
1. A document can contain multiple topics (e.g. a single review can talk about food and drinks and service).
2. A word can belong to multiple topics (e.g. warm applies to both food and ambience).
Dirichlet distributions are continuous, multivariate probability distributions that can model the relations described above. Applying LDA to the documents generates two sets of information: topics per document and words per topic. The number of topics is a hyperparameter that has to be tuned during training and is a prerequisite for running the model.
import logging
logging.basicConfig(filename='gensim.log',
format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.DEBUG)
lda_params = {"num_topics" : 35,
"alpha" : "auto",
"eta" : "auto",
"passes" : 5,
"iterations": 50,
"per_word_topics": True}
def get_vocabulary(tokens):
dictionary_lda = corpora.Dictionary(tokens)
corpus = [dictionary_lda.doc2bow(tok) for tok in tokens]
return {#"tokens" : tokens,
"dictionary_lda" : dictionary_lda,
"corpus" : corpus}
def get_lda_model(corpus, dictionary_lda, num_topics = 13, passes = 5, iterations = 50, alpha="auto", eta="auto", per_word_topics=True):
np.random.seed(49)
lda_model = models.LdaModel(corpus, num_topics=num_topics, \
id2word=dictionary_lda, \
random_state=49, update_every=1, \
iterations=iterations, passes=passes,\
alpha=alpha, eta=eta, \
per_word_topics=per_word_topics) #alpha=[0.01]*num_topics [0.01]*len(dictionary_LDA.keys())
return lda_model
def get_model_performance(parameter, cases, texts, lda_params, coherence="c_v"):
performance = {parameter:[], "coherence":[]}
for c in cases:
lda_params.update({parameter : c})
performance[parameter].append(c)
lda_model = get_lda_model(**lda_params)
coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=lda_params["dictionary_lda"], coherence=coherence)
performance["coherence"].append(coherence_model.get_coherence())
return performance
texts = reviews['topic_words'].tolist()
lda_params.update(get_vocabulary(texts))
num_topics = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
%time performance_num_topics = get_model_performance("num_topics", num_topics, texts, lda_params, "c_v")
alpha = [0.01, 0.05, 0.1, 0.5, 1, 1.5, 2, 2.5, 3, 3.5]
lda_params.update({"num_topics" : 35})
%time performance_alpha = get_model_performance("alpha", alpha, texts, lda_params, "c_v")
eta = [0.01, 0.03, 0.05, 0.07, 0.1]
lda_params.update({"num_topics" : 35, "alpha": 0.1})
%time performance_eta = get_model_performance("eta", eta, texts, lda_params, "c_v")
def plot_performance(x, y, plot_title, x_label, y_label):
plt.plot(x,y)
plt.scatter(x,y)
plt.title(plot_title)
plt.xlabel(x_label)
plt.ylabel(y_label)
plt.xticks(x)
# plt.show()
%matplotlib inline
fig = plt.figure(figsize=(16,4))
plt.subplot(1, 3, 1)
plot_performance(performance_num_topics["num_topics"], performance_num_topics["coherence"], 'Number of Topics vs. Coherence', 'Number of Topics', 'Coherence')
plt.subplot(1, 3, 2)
plot_performance(performance_alpha["alpha"], performance_alpha["coherence"], 'Alpha Values vs. Coherence', 'Alpha Values', 'Coherence')
plt.subplot(1, 3, 3)
plot_performance(performance_eta["eta"], performance_eta["coherence"], 'Eta Values vs. Coherence', 'Eta Values', 'Coherence')
plt.tight_layout()
plt.show()
With the alpha and eta hyperparameters learned automatically from the corpus as asymmetric priors, we see that increasing the number of topics beyond 35 results in almost constant coherence, so we set the number of topics to 35. For 35 topics, an alpha value of 0.1 yields the highest coherence. Alpha controls the topic-document density: if a document is expected to contain multiple topics, set this value high. Similarly, eta governs the word-topic density, i.e. the distribution of words over topics. Set it as low as possible to obtain good topic separability; if it is set high, different topics will contain the same words. Thomas L. Griffiths and Mark Steyvers, in Finding scientific topics, recommend setting alpha to the ratio $\frac{50}{\text{number of topics}}$ and eta to $0.1$.
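For reference, a quick computation of those heuristic values for our setting; the run below keeps the automatically learned priors, and the heuristic version is left commented out.
# Griffiths & Steyvers rule of thumb for 35 topics
heuristic_alpha = 50 / 35   # ~1.43
heuristic_eta = 0.1
print(round(heuristic_alpha, 2), heuristic_eta)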
lda_params.update(get_vocabulary(texts))
# lda_params.update({"num_topics": 35, "alpha": 0.1, "eta": 0.01})
lda_params.update({"num_topics": 35, "alpha": "auto", "eta": "auto"})
# lda_params.update({"alpha": 50/lda_params["num_topics"], "eta": 200/len(lda_params["dictionary_lda"]) })
%time lda_model = get_lda_model(**lda_params)
for i,topic in lda_model.show_topics(formatted=True, num_topics=lda_params["num_topics"], num_words=20):
print(str(i)+": "+ topic)
print()
From the topic distributions above we can see LDA binding some interrelated words together:
No. | Aspect | Tag | Description | Words |
---|---|---|---|---|
1. | Food | food | Words describing the food item and its characteristics. | Food, Meal, Delicious, Tasty, Portion, Flavor, Vegetarian, Yummy, Seafood, Spicy, Eat, Salty, Tender, Soggy, Snack, Ingredient, Quality Food, Roasted etc. |
2. | Order | order | Words describing the order placed at the restaurant. | Order, Dish, Entree, Dine, Italian, Starter, Bowl, Chef, Plate, Main Course, Appetizer, Snack, Mexican Food, Cuisine, Decided try, Specialty, Platter, Chinese Food etc. |
3. | Service | service | Words describing the service offered by the restaurant. | Service, Friendly, Service Excellent, Service Slow, Customer Service, Professionalism, Management, Service Fast, Helpful, Unfriendly, Service Prompt, Service Attentive etc. |
4. | Recommendation | recommendation | Words describing a recommendation, either given by the diner or by others. | Recommendation, Review, Return, Star, Rating, Suggest, Worth Visit, Definitely Recommend, Highly Recommended |
5. | Bar or Beverage | bar_beverage | Words referring to the bar, drinks or beverages. | Beer, Bar, Stout, Champagne, Cocktail, Happy Hour, Juice, Wine, Drink, Coffee, Tea, Glass, Milkshake, Bottle, Lemonade, Wine Selection, Alcohol, Brew, Beverage, Sampler, Refreshing etc. |
6. | Ambience | ambience | Words describing the surroundings of the restaurant. | Ambience, Decor, Warm, Busy, Crowd, Scene, Vibe, Neighbourhood, Cozy, Inviting, Welcoming, Relax, Live Music, Sunny, Theme, Loud, Environment, Lively, Interior, Music, View etc. |
7. | Place or Location | place_location | Words describing the restaurant location or nearby places. | Location, Place, Spot, Space, Street, View, Locate, Corner, Local, Market, Establishment, Section, Joint, Pub, Store, Stop, Clean, Floor |
8. | Experience | experience | Words referring to the diner's experience or feedback. | Experience, Wait, Visit, Enjoyable, Fault, Long Wait, Spend, Queue, Lost, Welcoming, Great Value, Complain, Lacking, Comfortable, Line, Special Occasion, Rush, Dining Experience etc. |
9. | Dessert | dessert | Words describing sweets or desserts. | Dessert, Sweet, Ice cream, Cheesecake, Pudding, Pastry, Sugar, Crepe, Mousse, Chocolate, Cake, Pancake, Waffle, Baked, Whipped Cream, Creamy, Pie etc. |
10. | Price | price | Words referring to monetary aspects. | Price, Cheap, Expensive, Bill, Cost, Prices Reasonable, Affordable, Overprice, Cash, Pricy, Spend, Good Value, Money, Tip, Pay, Inexpensive, Charge, Worth Price etc. |
11. | Menu or Offering | menu_offering | Words referring to the variety of food items offered at the restaurant. | Menu, Dish, Choice, Serve, Buffet, Homemade, Offer, Main Course, Tasting Menu, Option, Selection, Starter, Cuisine, Staple, Vegan, Limited, Snack, Variety |
12. | Staff or Team | staff_team | Words relating to the personnel at the restaurant. | Wait Staff, Waiter, Owner, Chef, Server, Staff, Waitress, Bartender, Hostess, Team, Host, Management, Friendly, Professional, Welcoming, Polite etc. |
13. | Facility | facility | Words describing facilities available at the restaurant. | Bar, Dining Room, Cafe, Outdoor, Kitchen, Parking, Garden, Seating, Store, Cooking, Patio, Indoor, Bathroom, Fine Dining, Roof, Terrace, Outdoor Seating, Screen etc. |
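The topic_list, topic_map and seed_words objects used later in this notebook are imported from stopwordslist.py and are not reproduced here. A minimal sketch of their assumed structure, using the tags from the table above and a few illustrative words, would look roughly like this:
# Sketch only: assumed structure of the aspect objects defined in stopwordslist.py
topic_list = ['food', 'order', 'service', 'recommendation', 'bar_beverage', 'ambience',
              'place_location', 'experience', 'dessert', 'price', 'menu_offering',
              'staff_team', 'facility']
# seed_words maps each seed word to the index of the topic it should anchor
seed_words = {'delicious': 0, 'portion': 0,      # food
              'friendly': 2, 'helpful': 2,       # service
              'decor': 5, 'cozy': 5,             # ambience
              'cheap': 9, 'expensive': 9}        # price
# topic_map similarly maps each learned topic index to one of the aspect tags above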
print(reviews.text.iloc[0])
print(reviews.lda_tokens.iloc[0])
print(reviews.topic_words.iloc[0])
print()
print(lda_model.get_document_topics(lda_params["corpus"][0]))
max(lda_model.get_document_topics(lda_params["corpus"][0]),key=lambda x: x[1])
len(lda_params["dictionary_lda"].keys())
For short texts, GSDMM, which is based on Gibbs sampling, generates better results. It requires setting an upper bound K on the maximum number of topics expected. Learning from the earlier LDA run, we will set this value greater than or equal to 13, as we can clearly expect at least 13 topics in the reviews data. Apart from that, we will set the optimal values of alpha and beta (eta in LDA) to $0.1$ and $0.01$ respectively, as learned from LDA.
# Create corpus and dictionary similar to LDA
docs = reviews.topic_words.to_numpy()
dictionary = corpora.Dictionary(docs)
vocab_length = len(dictionary)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
# gsdmm_model = MovieGroupProcess(K=13, alpha=0.07, beta=0.07, n_iters=20)
gsdmm_model = MovieGroupProcess(K=15, alpha=0.1, beta=0.01, n_iters=20)
%time y = gsdmm_model.fit(docs, vocab_length)
doc_count = np.array(gsdmm_model.cluster_doc_count)
print(f'\033[1mNumber of documents per topic : \033[0m{doc_count}')
topic_clusters_importance = doc_count.argsort()[-15:][::-1]
print(f'\033[1mMost important clusters based on number of documents inside it: \033[0m {topic_clusters_importance}')
def top_words(cluster_word_distribution, top_cluster, values):
for cluster in top_cluster:
sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
print("\nCluster %s : %s"%(cluster, sort_dicts))
# Get the top words in topics
top_words(gsdmm_model.cluster_word_distribution, topic_clusters_importance, 30)
From the results above we can see some meaningful clusters forming, such as food, bar/beverage, price, dessert, service, ambience and staff.
print(f'\033[1mTopic distribution for a document \033[0m', end="\n\n")
print(reviews.topic_words.iloc[0])
print(gsdmm_model.score(docs[0]))
# # Assign each document to most prevalent topic
# reviews["topic_gsdmm"] = y
# reviews['aspect_gsdmm'] = reviews["topic_gsdmm"].map({0:'food', 1:'bar_beverage', 2:'price', 3:'dessert', 4:'food', 5:'experience', 6:'menu_offering',
# 7:'recommendation', 8:'service', 9:'place_location', 10:'ambience', 11:'staff_team', 12:'order'})
# reviews.head(30)
# reviews_lda[["id", "text", "topic_words", "aspect", "topic_gsdmm", "aspect_gsdmm"]].to_csv("data/final_aspect.csv", index = False)
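# Semi-supervised (seeded) LDA: build an eta prior of shape (num_topics, vocabulary size),
# give every seed word a very large weight in its assigned topic, and pass the resulting
# matrix to the model as eta so that topics form around the seed vocabulary.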
eta_mat = np.full((lda_params["num_topics"], len(lda_params["dictionary_lda"])), fill_value=(1))
for word, topic in seed_words.items():
word_index = [key for key,term in lda_params["dictionary_lda"].items() if term==word]
if (len(word_index)>0):
eta_mat[topic, word_index[0]] = 1e10
# Divide each column by its total so that, for every word, the prior weights across topics sum to 1
eta_mat = np.divide(eta_mat, eta_mat.sum(axis=0))
# Use prior probability matrix as the new eta
lda_params.update({"eta" : eta_mat, "alpha": "auto"})
lda_params["eta"]
%time lda_model_prior = get_lda_model(**lda_params)
for i,topic in lda_model_prior.show_topics(formatted=True, num_topics=lda_params["num_topics"], num_words=30):
print(str(i)+": "+ topic)
print()
print(reviews.text.iloc[0])
print(reviews.topic_words.iloc[0], end="\n\n")
print(f'\033[1mTopic distribution for a document \033[0m', end="\n\n")
print(lda_model_prior.get_document_topics(lda_params["corpus"][0]))
print(f'\033[1mMost probable topic for the document \033[0m', end="\n\n")
max(lda_model_prior.get_document_topics(lda_params["corpus"][0]),key=lambda x: x[1])
# Assign each document to most prevalent topic
reviews["topic_sslda"] = [max(p,key=lambda item: item[1])[0] for p in lda_model_prior.get_document_topics(lda_params["corpus"])]
reviews['aspect_sslda'] = reviews["topic_sslda"].map(topic_map)
# Assign empty topic words with miscellaneous aspect
reviews.aspect_sslda = np.where(reviews['topic_words'].map(lambda d: len(d)) == 0, 'miscellaneous', reviews.aspect_sslda)
reviews.head()
texts = reviews['topic_words'].tolist()
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=lda_params["dictionary_lda"], coherence="c_v")
coherence_model_lda_prior = CoherenceModel(model=lda_model_prior, texts=texts, dictionary=lda_params["dictionary_lda"], coherence="c_v")
print(coherence_model_lda.get_coherence())
print(coherence_model_lda_prior.get_coherence())
vis = pyLDAvis.gensim_models.prepare(topic_model=lda_model, corpus=lda_params["corpus"], dictionary=lda_params["dictionary_lda"])
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)
vis_sslda = pyLDAvis.gensim_models.prepare(topic_model=lda_model_prior, corpus=lda_params["corpus"], dictionary=lda_params["dictionary_lda"])
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_sslda)
def get_corex_model(tokens, num_topics = 30, iterations=200, anchor_words=None, anchor_strength=2):
bow_cv = CountVectorizer(binary=True)
# Matrix of shape documents x vocabulary words
doc_word_mat = bow_cv.fit_transform(tokens)
doc_word_mat = ss.csr_matrix(doc_word_mat)
# Get words in the vocabulary
vocab_words = list(np.asarray(bow_cv.get_feature_names_out()))
# Remove numeric tokens
valid_idxs = [idx for idx,word in enumerate(vocab_words) if not word.isdigit()]
doc_word_mat = doc_word_mat[:,valid_idxs]
vocab_words = [word for idx,word in enumerate(vocab_words) if not word.isdigit()]
# Train the CorEx topic model with num_topics hidden topics
corex_model = ct.Corex(n_hidden=num_topics, words=vocab_words, max_iter=iterations, verbose=False, seed=1)
# corex_model.fit(doc_word_mat, words=vocab_words)
corex_model.fit(doc_word_mat, words=vocab_words, anchors=anchor_words, anchor_strength=anchor_strength)
return corex_model
%time corex_model = get_corex_model(tokens=reviews["lda_tokens_noun"].map(lambda x: " ".join(x)))
# Print a single topic from CorEx topic model
corex_model.get_topics(topic=1, n_words=10)
# Topic words list
for n,t in enumerate(corex_model.get_topics(n_words=10)):
topic_words,_,_ = zip(*t)
print('{}: '.format(n) + ', '.join(topic_words))
print(corex_model.p_y_given_x.shape) # documents x num_topics
corex_model.p_y_given_x[0]
corex_model.labels[0]
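# Anchored (guided) CorEx: group the seed words by their topic index so that each inner
# list of anchor_words anchors one CorEx topic.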
seed_words_list = dict(sorted(seed_words.items(), key=lambda item: item[1]))
anchor_words = []
for val in set(seed_words_list.values()):
anchor_words.append([k for k,v in seed_words_list.items() if v == val])
len(anchor_words)
%time corex_model_guided = get_corex_model(tokens=reviews["topic_words"].map(lambda x: " ".join(x)), num_topics=13, anchor_words=anchor_words)
# Print a single topic from CorEx topic model
corex_model_guided.get_topics(topic=1, n_words=10)
# Topic words list
for n,t in enumerate(corex_model_guided.get_topics(n_words=10)):
topic_words,_,_ = zip(*t)
print('{}: '.format(n) + ', '.join(topic_words))
print(corex_model_guided.p_y_given_x.shape) # documents x num_topics
corex_model_guided.p_y_given_x[0]
corex_model_guided.labels[0]
# No topic assigned to the document
print(f'\033[1mReview irrelevant to any topic : \033[0m{reviews.loc[123,"text"]}')
print(f'\033[1mNot assigned to any topic? : \033[0m{(~corex_model_guided.labels[123]).all()}')
# reviews["aspect_corex"] = [topic_list[np.argmax(prob)] for prob in corex_model.p_y_given_x]
reviews["aspect_corex"] = [np.array(topic_list)[t][0] if t.any() else 'miscellaneous' for t in corex_model_guided.labels]
# Assign empty topic words with miscellaneous aspect
reviews.aspect_corex = np.where(reviews['topic_words'].map(lambda d: len(d)) == 0, 'miscellaneous', reviews.aspect_corex)
reviews.head(3)
# reviews.to_csv("data/corex_model_aspect.csv", index=False)
# Two aspects in same sentence
print(reviews.text[407])
np.array(topic_list)[corex_model_guided.labels[407]]
display(reviews[["text", "topic_words", "aspect_corex"]].loc[403:411])
reviews[["restaurant_id", "review_title", "text", "topic_words", "aspect_corex"]].loc[np.r_[191:196, 77:87, 133:139]]