Source code for cwordtm.tm

# tm.py
#    
# Topic modeling with LDA, NMF, and BERTopic for a prescribed range of
#   Scripture or other text
#
# Copyright (c) 2025 CWordTM Project 
# Author: Johnny Cheng <drjohnnycheng@gmail.com>
#
# Updated: 18-Jun-2024 (0.6.4), 21-Nov-2024, 14-Jan-2025, 29-Jan-2025 (0.7.4)
#
# URL: https://github.com/drjohnnycheng/cwordtm.git
# For license information, see LICENSE.TXT


# Dependencies

import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
import pandas as pd
import string
import re
import math

import time
from pprint import pprint
from IPython.display import IFrame
from importlib_resources import files

import jieba
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams
import nltk

from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel

import torch
from bertopic import BERTopic
from transformers import BertTokenizer, BertModel

import matplotlib
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

from . import util



[docs]
def load_text(textfile, doc_size=0, text_col='text'):
    """Loads the prescribed file ('textfile') and returns its documents as a list.

    The loading itself is delegated to ``util.load_text``; the column named
    by 'text_col' is then taken from its result and converted to a list.

    :param textfile: The prescribed text file from which the text is loaded
    :type textfile: str
    :param doc_size: The number of documents to be processed, 0 represents all
        documents, or the range (tuple) of documents to be processed,
        default to 0
    :type doc_size: int, tuple, optional
    :param text_col: The name of the text column to be extracted,
        default to 'text'
    :type text_col: str, optional
    :return: The list of documents loaded
    :rtype: list
    """

    loaded = util.load_text(textfile, doc_size, text_col)
    text_column = loaded[text_col]
    return list(text_column)




[docs]
def load_bible(textfile, cat=0, group=True):
    """Loads and returns the Bible Scripture from the prescribed internal
    file ('textfile') as a list of text strings.

    :param textfile: The package's internal Bible text from which the text is loaded,
        either World English Bible ('web.csv') or Chinese Union Version (Traditional)
        ('cuv.csv')
    :type textfile: str
    :param cat: The category indicating a subset of the Scripture to be loaded, where
        0 stands for the whole Bible, 1 (or 'ot') for OT, 2 (or 'nt') for NT, or
        one of the ten categories
        ['tor', 'oth', 'ket', 'map', 'mip', 'gos', 'nth', 'pau', 'epi', 'apo'] (See
        the package's internal file 'data/book_cat.csv'). Matching ignores letter
        case and surrounding whitespace; any unrecognized value loads the whole
        Bible, default to 0
    :type cat: int or str, optional
    :param group: The flag indicating whether the loaded text is grouped by chapter,
        default to True
    :type group: bool, optional
    :return: The list of Scripture texts loaded, one string per chapter if
        'group' is True, otherwise one string per loaded row
    :rtype: list
    """

    scfile = files('cwordtm.data').joinpath(textfile)
    print("Loading Bible '%s' ..." %scfile)
    df = pd.read_csv(scfile)

    cat_list = ('tor', 'oth', 'ket', 'map', 'mip',
                'gos', 'nth', 'pau', 'epi', 'apo')

    # Normalize so that e.g. 1, '1', 'ot' and 'OT' all select the same subset
    cat = str(cat).strip().lower()
    if cat in ('1', 'ot'):
        df = util.extract(df, testament=0)
    elif cat in ('2', 'nt'):
        df = util.extract(df, testament=1)
    elif cat in cat_list:
        df = util.extract(df, category=cat)
    # Any other value (including the default 0) keeps the whole text

    if group:
        # Group verses into chapters, joining the verses with single spaces
        df = (df.groupby(['book_no', 'chapter'])
                .agg({'text': ' '.join})
                .reset_index())

    # Literal (non-regex) removal of the wide space character used in the text;
    # 'regex=False' makes this independent of the pandas version's default
    df['text'] = df['text'].str.replace('　', '', regex=False)
    return list(df['text'])


   

[docs]
def process_text(doc):
    """Processes the English text through tokenization, converting to lower case,
    removing all digits, lemmatization, and removing punctuations, stopwords
    and single-character tokens.

    :param doc: The prescribed text, in form of a string, to be processed
    :type doc: str
    :return: The list of the processed tokens
    :rtype: list
    """

    # Punctuation marks, kept in a set for constant-time membership tests
    punc = set(string.punctuation)

    # Stop words; 'add_stop' is the place to list extra project-specific ones
    add_stop = []
    stop_words = ENGLISH_STOP_WORDS.union(add_stop)

    # Build the lemmatizer once instead of once per token
    lemmatize = WordNetLemmatizer().lemmatize

    # Per token: lower case -> strip digits -> lemmatize (same order as before)
    tokens = [lemmatize(re.sub('[0-9]+', '', token.lower()))
              for token in TweetTokenizer().tokenize(doc)]

    # Single pass dropping punctuation, stop words and one-character leftovers
    return [w for w in tokens
            if len(w) > 1 and w not in punc and w not in stop_words]




[docs]
class LDA:
    """The LDA object for Latent Dirichlet Allocation (LDA) modeling.

    The constructor stores 'doc_file', 'num_topics' and 'chi'; every other
    attribute starts as None and is filled in by a later step ('docs' by the
    caller, 'pro_docs', 'dictionary' and 'corpus' by preprocessing, 'model'
    by fitting, and 'vis_data' by visualization).

    :ivar doc_file: The filename of the text file to be processed
    :vartype doc_file: str
    :ivar chi: The flag indicating whether the processed text is in Chinese or not,
        True stands for Traditional Chinese or False for English
    :vartype chi: bool
    :ivar num_topics: The number of topics set for the topic model
    :vartype num_topics: int
    :ivar docs: The collection of the original documents to be processed
    :vartype docs: pandas.DataFrame or list
    :ivar pro_docs: The collection of documents, in form of list of lists of words
        after text preprocessing
    :vartype pro_docs: list
    :ivar dictionary: The dictionary of word ids with their tokenized words
        from preprocessed documents ('pro_docs')
    :vartype dictionary: gensim.corpora.Dictionary
    :ivar corpus: The list of documents, where each document is a list of tuples
        (word id, word frequency in the particular document)
    :vartype corpus: list
    :ivar model: The LDA model object
    :vartype model: gensim.models.LdaModel
    :ivar vis_data: The LDA model's prepared data for visualization
    :vartype vis_data: pyLDAvis.PreparedData
    """


[docs]
    def __init__(self, doc_file, num_topics, chi=False):
        """Constructor method.
        """

        self.doc_file = doc_file
        self.num_topics = num_topics
        self.chi = chi
        self.docs = None
        self.pro_docs = None
        self.dictionary = None
        self.corpus = None
        self.model = None
        self.vis_data = None


    

[docs]
    def preprocess(self):
        """Preprocess the original English documents (cwordtm.tm.LDA.docs)
        with cwordtm.tm.process_text, append the bigrams of every document
        to its token list, and build the dictionary and the bag-of-words
        corpus used by the LDA model.
        """

        tokenized = [process_text(text) for text in self.docs]

        # Each document keeps its tokens, followed by its underscore-joined bigrams
        self.pro_docs = [
            tokens + ["_".join(pair) for pair in ngrams(tokens, 2)]
            for tokens in tokenized
        ]

        # Dictionary and corpus for the LDA model
        vocabulary = corpora.Dictionary(self.pro_docs)
        self.dictionary = vocabulary
        self.corpus = [vocabulary.doc2bow(tokens) for tokens in self.pro_docs]




[docs]
    def preprocess_chi(self):
        """Process the original Chinese documents (cwordtm.tm.LDA.docs)
        by tokenizing text with Jieba, removing stopwords, and building a
        dictionary and a corpus from the preprocessed documents for the
        LDA model.

        The stop words come from the package data file 'tc_stopwords_2.txt'
        (one per line) and the Jieba user dictionary from 'user_dict_4.txt'.
        """

        # Build stop words: the file is closed deterministically, only the line
        # terminator is stripped (so a last line without a trailing newline is
        # not truncated), and a set gives constant-time lookups
        stop_file = files('cwordtm.data').joinpath("tc_stopwords_2.txt")
        with open(stop_file, encoding='utf-8') as stop_fh:
            stopwords = {line.rstrip('\n') for line in stop_fh}
        stopwords.discard('')   # blank lines carry no stop word

        # Tokenize the Chinese text using Jieba
        dict_file = files('cwordtm.data').joinpath("user_dict_4.txt")
        jieba.load_userdict(str(dict_file))

        self.pro_docs = []
        for doc in self.docs:
            # Replace the ideographic space inside tokens by a plain space
            words = (word.replace('\u3000', ' ') for word in jieba.cut(doc))

            # Remove stop words; joining and re-splitting on whitespace also
            # drops whitespace-only tokens
            kept = ' '.join(word for word in words if word not in stopwords)
            self.pro_docs.append(kept.split())

        # Create a dictionary and corpus
        self.dictionary = corpora.Dictionary(self.pro_docs)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.pro_docs]




[docs]
    def fit(self):
        """Train the LDA model on the prepared corpus and dictionary and
        store it in 'model'.
        """

        lda_settings = {
            'num_topics': self.num_topics,
            'id2word': self.dictionary,
            'passes': 10,
        }
        self.model = models.LdaModel(self.corpus, **lda_settings)


    

[docs]
    def viz(self, web_app=False):
        """Prepares the Intertopic Distance Map data for the built LDA model,
        stores it in 'vis_data', and prints a hint on how to display it.

        The object returned by ``pyLDAvis.display`` is not kept or returned
        here, hence the printed hint.

        :param web_app: The flag indicating the function is initiated from a web
            application (accepted but not used in this method), default to False
        :type web_app: bool
        """

        prepared = gensimvis.prepare(self.model, self.corpus, self.dictionary)
        self.vis_data = prepared
        pyLDAvis.display(prepared)

        hint_lines = (
            "You may execute the following commands to show the visualization:",
            "    import pyLDAvis",
            "    pyLDAvis.display(lda.vis_data)\n",
        )
        for line in hint_lines:
            print(line)




[docs]
    def show_topics(self):
        """Pretty-prints the topics, with their keywords, of the built LDA model.
        """

        header = "\nTopics from LDA Model:"
        print(header)
        topics = self.model.print_topics()
        pprint(topics)


    

[docs]
    def evaluate(self):
        """Computes and outputs the coherence score, per-word likelihood bound,
        perplexity, topic diversity, and topic size distribution.

        The last two figures are derived from the number of topics the model
        assigns to each document of the corpus.
        """

        # Compute coherence score
        coherence_model = CoherenceModel(model=self.model,
                                         texts=self.pro_docs,
                                         dictionary=self.dictionary,
                                         coherence='c_v')
        print(f"  Coherence: {coherence_model.get_coherence()}")

        # gensim's log_perplexity() returns the per-word likelihood bound, not
        # the perplexity itself; the perplexity is 2 ** (-bound)
        per_word_bound = self.model.log_perplexity(self.corpus)
        print(f"  Per-word likelihood bound: {per_word_bound}")
        print(f"  Perplexity: {np.exp2(-per_word_bound)}")

        # Number of topics the model reports for each document
        topic_sizes = [len(self.model[bow]) for bow in self.corpus]
        total_assignments = sum(topic_sizes)

        # "Topic diversity": sum of the squared shares of those counts
        topic_diversity = sum((size / total_assignments) ** 2
                              for size in topic_sizes)
        print(f"  Topic diversity: {topic_diversity}")

        # "Topic size distribution": share held by the largest count
        topic_size_distribution = max(topic_sizes) / total_assignments
        print(f"  Topic size distribution: {topic_size_distribution}\n")




[docs]
    def save(self, file):
        """Saves the built LDA model to the specified file.

        :param file: The name of the file to store the built model, default to None
        :type file: str
        """

        if file is None or len(file.strip())==0:
            print("No valid filename has been specifid!")
            return

        if file.split('.')[-1] == file:
            file += '.gensim'

        self.model.save(file)
        print(f"LDA model has been stored in {file!r}.")




[docs]
    def load(self, file):
        """Loads the stored LDA model from the specified file.

        :param file: The name of the file to be loaded, default to None
        :type file: str
        :return: The loaded LDA model
        :rtype: gensim.models.LdaModel
        """

        if file is None or len(file.strip())==0:
            print("No valid filename has been specifid!")
            return

        if file.split('.')[-1] == file:
            file += '.gensim'

        return models.LdaModel.load(file)



# End of LDA Class



[docs]
def lda_process(doc_file, num_topics=10, source=0, text_col='text', doc_size=0, cat=0, chi=False, group=True, eval=False, web_app=False):
    """Pipelines the LDA modeling: load the documents, preprocess them,
    train the model, prepare the visualization, show the topics and,
    on request, print the evaluation scores.

    :param doc_file: The filename of the prescribed text file to be loaded,
        or a BytesIO object from Streamlit's file_uploader
    :type doc_file: str or io.BytesIO
    :param num_topics: The number of topics to be modeled, default to 10
    :type num_topics: int, optional
    :param source: The source of the prescribed document file ('doc_file'),
        where 0 refers to internal store of the package and any other value
        to an external file, default to 0
    :type source: int, optional
    :param text_col: The name of the text column to be extracted (external
        file only), default to 'text'
    :type text_col: str, optional
    :param doc_size: The number of documents to be processed, 0 represents all
        documents, or the range (tuple) of documents to be processed (external
        file only), default to 0
    :type doc_size: int, tuple, optional
    :param cat: The category indicating a subset of the Scripture to be loaded, where
        0 stands for the whole Bible, 1 for OT, 2 for NT, or one of the ten categories
        ['tor', 'oth', 'ket', 'map', 'mip', 'gos', 'nth', 'pau', 'epi', 'apo'] (See
        the package's internal file 'data/book_cat.csv') (internal store only),
        default to 0
    :type cat: int or str, optional
    :param chi: The flag indicating whether the text is processed as Chinese (True)
        or English (False), default to False
    :type chi: bool, optional
    :param group: The flag indicating whether the loaded text is grouped by chapter
        (internal store only), default to True
    :type group: bool, optional
    :param eval: The flag indicating whether the model evaluation results will be shown,
        default to False
    :type eval: bool, optional
    :param web_app: The flag indicating the function is initiated from a web application,
        default to False
    :type web_app: bool
    :return: The pipelined LDA
    :rtype: cwordtm.tm.LDA object
    """

    lda = LDA(doc_file, num_topics, chi)

    # Step 1: load the documents from the package store or an external file
    from_package = (source == 0)
    lda.docs = (load_bible(lda.doc_file, cat=cat, group=group) if from_package
                else load_text(lda.doc_file, doc_size, text_col))
    print("Corpus loaded!")

    # Step 2: language-specific preprocessing
    preprocess_step = lda.preprocess_chi if chi else lda.preprocess
    preprocess_step()
    print("Text preprocessed!")

    # Step 3: training, visualization and topic listing
    lda.fit()
    print("Text trained!")
    lda.viz(web_app)
    print("Visualization prepared!")
    lda.show_topics()

    # Step 4: optional evaluation
    if not eval:
        return lda

    print("\nModel Evaluation Scores:")
    lda.evaluate()
    return lda




[docs]
class NMF:
    """The NMF object for Non-negative Matrix Factorization (NMF) modeling.

    The constructor stores 'doc_file', 'num_topics' and 'chi'; 'figures'
    starts as an empty list and every other attribute starts as None, to be
    filled in by a later step ('docs' by the caller, 'pro_docs', 'dictionary'
    and 'corpus' by preprocessing, and 'model' by fitting).

    :ivar doc_file: The filename of the text file to be processed
    :vartype doc_file: str
    :ivar chi: The flag indicating whether the processed text is in Chinese or not,
        True stands for Traditional Chinese or False for English
    :vartype chi: bool
    :ivar num_topics: The number of topics set for the topic model
    :vartype num_topics: int
    :ivar docs: The collection of the original documents to be processed
    :vartype docs: pandas.DataFrame or list
    :ivar pro_docs: The collection of documents, in form of list of lists of words
        after text preprocessing
    :vartype pro_docs: list
    :ivar dictionary: The dictionary of word ids with their tokenized words
        from preprocessed documents ('pro_docs')
    :vartype dictionary: gensim.corpora.Dictionary
    :ivar corpus: The list of documents, where each document is a list of tuples
        (word id, word frequency in the particular document)
    :vartype corpus: list
    :ivar model: The NMF model object
    :vartype model: gensim.models.Nmf
    :ivar figures: The list of model visualization figures
    :vartype figures: list(matplotlib.pyplot.figure)
    """


[docs]
    def __init__(self, doc_file, num_topics, chi=False):
        """Constructor method.
        """

        self.doc_file = doc_file
        self.num_topics = num_topics
        self.chi = chi
        self.docs = None
        self.pro_docs = None
        self.dictionary = None
        self.corpus = None
        self.model = None
        self.figures = []


    

[docs]
    def preprocess(self):
        """Preprocess the original English documents (cwordtm.tm.NMF.docs)
        with cwordtm.tm.process_text, append the bigrams of every document
        to its token list, and build the dictionary and the bag-of-words
        corpus used by the NMF model.
        """

        tokenized = [process_text(text) for text in self.docs]

        # Each document keeps its tokens, followed by its underscore-joined bigrams
        self.pro_docs = [
            tokens + ["_".join(pair) for pair in ngrams(tokens, 2)]
            for tokens in tokenized
        ]

        # Dictionary and corpus for the NMF model
        vocabulary = corpora.Dictionary(self.pro_docs)
        self.dictionary = vocabulary
        self.corpus = [vocabulary.doc2bow(tokens) for tokens in self.pro_docs]




[docs]
    def preprocess_chi(self):
        """Process the original Chinese documents (cwordtm.tm.NMF.docs)
        by tokenizing text with Jieba, removing stopwords, and building a
        dictionary and a corpus from the preprocessed documents for the
        NMF model.

        The stop words come from the package data file 'tc_stopwords_2.txt'
        (one per line) and the Jieba user dictionary from 'user_dict_4.txt'.
        """

        # Build stop words: the file is closed deterministically, only the line
        # terminator is stripped (so a last line without a trailing newline is
        # not truncated), and a set gives constant-time lookups
        stop_file = files('cwordtm.data').joinpath("tc_stopwords_2.txt")
        with open(stop_file, encoding='utf-8') as stop_fh:
            stopwords = {line.rstrip('\n') for line in stop_fh}
        stopwords.discard('')   # blank lines carry no stop word

        # Tokenize the Chinese text using Jieba
        dict_file = files('cwordtm.data').joinpath("user_dict_4.txt")
        jieba.load_userdict(str(dict_file))

        self.pro_docs = []
        for doc in self.docs:
            # Replace the ideographic space inside tokens by a plain space
            words = (word.replace('\u3000', ' ') for word in jieba.cut(doc))

            # Remove stop words; joining and re-splitting on whitespace also
            # drops whitespace-only tokens
            kept = ' '.join(word for word in words if word not in stopwords)
            self.pro_docs.append(kept.split())

        # Create a dictionary and corpus
        self.dictionary = corpora.Dictionary(self.pro_docs)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.pro_docs]




[docs]
    def fit(self):
        """Train the NMF model on the prepared corpus and store it in 'model'.

        Only the corpus and the number of topics are handed to the model;
        the dictionary is not passed in here.
        """

        topic_count = self.num_topics
        self.model = models.Nmf(self.corpus, num_topics=topic_count)




[docs]
    def show_topics_words(self):
        """Prints, for every topic of the built NMF model, its ten top words
        together with their weights.
        """

        print("\nTopics-Words from NMF Model:")
        for number in range(1, self.num_topics + 1):
            # Topics are numbered from 1 in the output, from 0 in the model
            top_words = self.model.show_topic(number - 1, topn=10)
            print(f"Topic {number}:")
            for word_id, prob in top_words:
                # The model reports word ids; the dictionary maps them to words
                print("%s (%.6f)" % (self.dictionary[int(word_id)], prob))
            print()


    

[docs]
    def viz(self, web_app=False):
        """Plot the topic distributions as a stacked bar chart for the built NMF model.

        The chart has one bar position per distinct word among the ten top
        words of all topics, and one stacked segment per topic.

        :param web_app: The flag indicating the function is initiated from a web
            application; if True the figure is also appended to 'figures',
            default to False
        :type web_app: bool
        """

        # Ten top (word id, weight) pairs of every topic
        top_words = [self.model.show_topic(topic_id, topn=10)
                     for topic_id in range(self.num_topics)]

        # Distinct word ids over all topics, and the column of each one
        all_ids = [int(word_id) for topic in top_words for word_id, _ in topic]
        word_ids = list(set(all_ids))
        column_of = {word_id: col for col, word_id in enumerate(word_ids)}

        # Topic distribution table: one row per topic, one column per word
        topic_dist = np.zeros((self.num_topics, len(word_ids)))
        for row, topic in enumerate(top_words):
            for word_id, prob in topic:
                topic_dist[row, column_of[int(word_id)]] = prob

        # Words matching the columns of the table
        word_list = [self.dictionary[word_id] for word_id in word_ids]

        # Plot the topic distributions
        matplotlib.rcParams['font.family'] = ['Microsoft YaHei']
        fig = plt.figure(figsize=(10, 6))

        stacked = np.zeros(len(word_list))
        for number, weights in enumerate(topic_dist, start=1):
            plt.bar(word_list, weights, width=0.8, bottom=stacked,
                    label=f"Topic {number}")
            stacked += weights

        plt.xticks(range(len(word_list)), word_list, rotation=90)
        plt.title("Topic Distributions")
        plt.xlabel("Words")
        plt.ylabel("Importance")
        plt.legend(loc="best")
        plt.show()

        if web_app:
            self.figures.append(fig)




[docs]
    def evaluate(self):
        """Computes and outputs the coherence score, topic diversity,
        and topic size distribution.

        The last two figures are derived from the number of topics the model
        assigns to each document of the corpus.
        """

        # Coherence score
        scorer = CoherenceModel(model=self.model,
                                texts=self.pro_docs,
                                dictionary=self.dictionary,
                                coherence='c_v')
        print(f"  Coherence: {scorer.get_coherence()}")

        # Number of topics the model reports for each document
        topic_sizes = [len(self.model[bow]) for bow in self.corpus]

        # "Topic diversity": sum of the squared shares of those counts
        total = sum(topic_sizes)
        topic_diversity = sum([(count / total) ** 2 for count in topic_sizes])
        print(f"  Topic diversity: {topic_diversity}")

        # "Topic size distribution": share held by the largest count
        largest_share = max(topic_sizes) / sum(topic_sizes)
        print(f"  Topic size distribution: {largest_share}\n")




[docs]
    def save(self, file):
        """Saves the built NMF model to the specified file.

        :param file: The name of the file to store the built model, default to None
        :type file: str
        """

        if file is None or len(file.strip())==0:
            print("No valid filename has been specifid!")
            return

        base_name = file.split('.')[0]
        model_file = base_name + '_model.gensim'
        dict_file = base_name + '_dictionary.gensim'
        self.model.save(model_file)
        self.dictionary.save(dict_file)
        # corpora.MmCorpus.serialize(base_name+'_corpus.mm', self.corpus)
        print(f"NMF model has been saved: {model_file!r} and {dict_file!r}")




[docs]
    def load(self, file):
        """Loads the stored NMF model from the specified file.

        :param file: The name of the file to be loaded, default to None
        :type file: str
        :return: The loaded NMF model and the loaded dictionary of the NMF's corpus
        :rtype: gensim.models.Nmf, gensim.corpora.Dictionary
        """

        if file is None or len(file.strip())==0:
            print("No valid filename has been specifid!")
            return

        base_name = file.split('.')[0]
        model_file = base_name + '_model.gensim'
        dict_file = base_name + '_dictionary.gensim'
        try:
            loaded_model = models.Nmf.load(model_file)
            loaded_dict = corpora.Dictionary.load(dict_file)
        except:
            print("Moldel file or dictionary file cannot be loaded!")
            return

        return loaded_model, loaded_dict



# End of NMF Class



[docs]
def nmf_process(doc_file, num_topics=10, source=0, text_col='text', doc_size=0, cat=0, chi=False, group=True, eval=False, web_app=False):
    """Pipelines the NMF modeling: load the documents, preprocess them,
    train the model, show the topics with their words, plot the topic
    distributions and, on request, print the evaluation scores.

    :param doc_file: The filename of the prescribed text file to be loaded,
        or a BytesIO object from Streamlit's file_uploader
    :type doc_file: str or io.BytesIO
    :param num_topics: The number of topics to be modeled, default to 10
    :type num_topics: int, optional
    :param source: The source of the prescribed document file ('doc_file'),
        where 0 refers to internal store of the package and any other value
        to an external file, default to 0
    :type source: int, optional
    :param text_col: The name of the text column to be extracted (external
        file only), default to 'text'
    :type text_col: str, optional
    :param doc_size: The number of documents to be processed, 0 represents all
        documents, or the range (tuple) of documents to be processed (external
        file only), default to 0
    :type doc_size: int, tuple, optional
    :param cat: The category indicating a subset of the Scripture to be loaded, where
        0 stands for the whole Bible, 1 for OT, 2 for NT, or one of the ten categories
        ['tor', 'oth', 'ket', 'map', 'mip', 'gos', 'nth', 'pau', 'epi', 'apo'] (See
        the package's internal file 'data/book_cat.csv') (internal store only),
        default to 0
    :type cat: int or str, optional
    :param chi: The flag indicating whether the text is processed as Chinese (True)
        or English (False), default to False
    :type chi: bool, optional
    :param group: The flag indicating whether the loaded text is grouped by chapter
        (internal store only), default to True
    :type group: bool, optional
    :param eval: The flag indicating whether the model evaluation results will be shown,
        default to False
    :type eval: bool, optional
    :param web_app: The flag indicating the function is initiated from a web application,
        default to False
    :type web_app: bool
    :return: The pipelined NMF
    :rtype: cwordtm.tm.NMF object
    """

    nmf = NMF(doc_file, num_topics, chi)

    # Step 1: load the documents from the package store or an external file
    from_package = (source == 0)
    nmf.docs = (load_bible(nmf.doc_file, cat=cat, group=group) if from_package
                else load_text(nmf.doc_file, doc_size, text_col))
    print("Corpus loaded!")

    # Step 2: language-specific preprocessing
    preprocess_step = nmf.preprocess_chi if chi else nmf.preprocess
    preprocess_step()
    print("Text preprocessed!")

    # Step 3: training, topic listing and plotting
    nmf.fit()
    print("Text trained!")
    nmf.show_topics_words()
    nmf.viz(web_app)

    # Step 4: optional evaluation
    if not eval:
        return nmf

    print("\nModel Evaluation Scores:")
    nmf.evaluate()
    return nmf




[docs]
class BTM:
    """The BTM object for BERTopic modeling.

    The attributes below are initialised by the constructor and filled in
    by the preprocessing, fitting, evaluation and visualization methods.

    :ivar doc_file: The filename of the text file to be processed
    :vartype doc_file: str
    :ivar num_topics: The number of topics requested from the topic model
        (passed to BERTopic as 'nr_topics')
    :vartype num_topics: int
    :ivar chi: The flag indicating whether the processed text is in Chinese or not,
        True stands for Traditional Chinese or False for English
    :vartype chi: bool
    :ivar docs: The collection of the original documents to be processed
    :vartype docs: pandas.DataFrame or list
    :ivar pro_docs: The collection of documents, in form of list of lists of words
        after text preprocessing
    :vartype pro_docs: list
    :ivar dictionary: The dictionary of word ids with their tokenized words,
        built from the preprocessed documents ('pro_docs') and rebuilt
        from the per-topic documents during model evaluation
    :vartype dictionary: gensim.corpora.Dictionary
    :ivar corpus: The list of documents, where each document is a list of tuples
        (word id, word frequency in the particular document)
    :vartype corpus: list
    :ivar model: The BERTopic model object
    :vartype model: bertopic.BERTopic
    :ivar embed: The flag indicating whether a BERT pretrained model is loaded
        and handed to the BERTopic model as its embedding model
    :vartype embed: bool
    :ivar bmodel: The BERT pretrained model (loaded only when 'embed' is True)
    :vartype bmodel: transformers.BertModel
    :ivar bt_vectorizer: The vectorizer extracted from the BERTopic model
        for model evaluation
    :vartype bt_vectorizer: sklearn.feature_extraction.text.CountVectorizer
    :ivar bt_analyzer: The analyzer built from 'bt_vectorizer'
        for model evaluation
    :vartype bt_analyzer: functools.partial
    :ivar cleaned_docs: The list of documents (string) built by grouping
        the original documents by the topics created from the BERTopic model
    :vartype cleaned_docs: list
    :ivar too_few: The flag indicating whether there are too few documents
        to fit the BERTopic model (set when fitting raises a TypeError)
    :vartype too_few: bool
    :ivar figures: The list of tuples (figure title, figure) collected by
        the visualization method when it is run for a web application;
        the figures are the objects returned by BERTopic's visualize_* methods
    :vartype figures: list(tuple(str, figure))
    """


[docs]
    def __init__(self, doc_file, num_topics, chi=False, embed=True):
        """Constructor method.
        """

        self.doc_file = doc_file
        self.num_topics = num_topics
        self.chi = chi
        self.docs = None
        self.pro_docs = None
        self.dictionary = None
        self.corpus = None
        self.model = None
        self.figures = []

        self.embed = embed
        self.bmodel = None
        self.bt_vectorizer = None
        self.bt_analyzer = None
        self.cleaned_docs = None
        self.too_few = False


    

[docs]
    def preprocess(self):
        """Preprocess the original English documents (cwordtm.tm.BTM.docs).

        Every document is tokenized with cwordtm.tm.process_text, its bigrams
        (joined with '_') are appended to its token list, and a gensim
        dictionary and bag-of-words corpus are then built from the result.
        """

        # Tokenize every document first, then extend each token list in place
        token_lists = [process_text(text) for text in self.docs]
        self.pro_docs = token_lists

        for tokens in token_lists:
            # Materialize the bigrams before extending, because ngrams()
            # reads lazily from the very list that is being extended
            bigrams = ["_".join(pair) for pair in ngrams(tokens, 2)]
            tokens += bigrams

        # Dictionary (token <-> id) and bag-of-words corpus
        self.dictionary = corpora.Dictionary(token_lists)
        self.corpus = [self.dictionary.doc2bow(tokens) for tokens in token_lists]




[docs]
    def preprocess_chi(self):
        """Preprocess the original Chinese documents (cwordtm.tm.BTM.docs).

        The text is tokenized with Jieba (using the package's user
        dictionary), ideographic spaces are replaced, stop words are removed,
        and a gensim dictionary and bag-of-words corpus are built from the
        remaining tokens. The results are stored in 'pro_docs', 'dictionary'
        and 'corpus'.
        """

        # Build the stop-word set. The file is closed deterministically, only
        # the line terminator is stripped (so the last entry survives even if
        # the file has no trailing newline), and blank lines are ignored.
        stop_file = files('cwordtm.data').joinpath("tc_stopwords_2.txt")
        with open(stop_file, encoding='utf-8') as stop_fh:
            stopwords = {line.rstrip('\n') for line in stop_fh}
        stopwords.discard('')

        # Tokenize the Chinese text using Jieba
        dict_file = files('cwordtm.data').joinpath("user_dict_4.txt")
        jieba.load_userdict(str(dict_file))

        pro_docs = []
        for doc in self.docs:
            # Replace the ideographic space before filtering
            words = (word.replace('\u3000', ' ') for word in jieba.cut(doc))
            kept = ' '.join(word for word in words if word not in stopwords)
            # Joining and re-splitting drops whitespace-only tokens and
            # splits any token that still contains whitespace
            pro_docs.append(kept.split())
        self.pro_docs = pro_docs

        # Create a dictionary and corpus
        self.dictionary = corpora.Dictionary(self.pro_docs)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in self.pro_docs]




[docs]
    def fit(self):
        """Build and fit the BERTopic model for English text.

        The preprocessed documents (cwordtm.tm.BTM.pro_docs) are joined back
        into space-separated strings and passed to BERTopic. When 'embed' is
        set, the 'bert-base-uncased' pretrained model is loaded and given to
        BERTopic as its embedding model. If fitting raises a TypeError, the
        'too_few' flag is set and a message is printed instead.
        """

        joined_docs = [" ".join(tokens) for tokens in self.pro_docs]

        options = {
            'language': 'english',
            'calculate_probabilities': True,
        }
        if self.embed:
            # NOTE(review): confirm that BERTopic accepts a raw
            # transformers.BertModel as 'embedding_model'
            self.bmodel = BertModel.from_pretrained('bert-base-uncased')
            options['embedding_model'] = self.bmodel
        options['nr_topics'] = self.num_topics

        self.model = BERTopic(**options)

        try:
            _topics, _probs = self.model.fit_transform(joined_docs)
        except TypeError:
            self.too_few = True
            print("Possibly too few documents for BERTopic modeling!")




[docs]
    def fit_chi(self):
        """Build and fit the BERTopic model for Chinese text.

        The preprocessed documents (cwordtm.tm.BTM.pro_docs) are joined back
        into space-separated strings and passed to BERTopic. When 'embed' is
        set, the 'bert-base-chinese' pretrained model is loaded and given to
        BERTopic as its embedding model. If fitting raises a TypeError, the
        'too_few' flag is set and a message is printed instead.
        """

        joined_docs = [" ".join(tokens) for tokens in self.pro_docs]

        options = {
            'language': 'chinese (traditional)',
            'calculate_probabilities': True,
        }
        if self.embed:
            # NOTE(review): confirm that BERTopic accepts a raw
            # transformers.BertModel as 'embedding_model'
            self.bmodel = BertModel.from_pretrained('bert-base-chinese')
            options['embedding_model'] = self.bmodel
        options['nr_topics'] = self.num_topics

        self.model = BERTopic(**options)

        try:
            _topics, _probs = self.model.fit_transform(joined_docs)
        except TypeError:
            self.too_few = True
            print("Possibly too few documents for BERTopic modeling!")




[docs]
    def show_topics(self):
        """Shows the topics with their keywords from the built BERTopic model.
        """

        if self.too_few: return
        print("\nTopics from BERTopic Model:")
        for topic in self.model.get_topic_freq().Topic:
            if topic == -1: continue
            twords = [word for (word, _) in self.model.get_topic(topic)]
            print(f"Topic {topic}: {' | '.join(twords)}")




[docs]
    def pre_evaluate(self):
        """Prepare the original documents per built topic for model evaluation.
        """

        if self.too_few: return
        doc_df = pd.DataFrame({"Document": self.docs,
                       "ID": range(len(self.docs)),
                       "Topic": self.model.topics_})
        documents_per_topic = doc_df.groupby(['Topic'], \
                             as_index=False).agg({'Document': ' '.join})
        self.cleaned_docs = self.model._preprocess_text(\
                              documents_per_topic.Document.values)

        # Extract vectorizer and analyzer from BERTopic
        self.bt_vectorizer = self.model.vectorizer_model
        self.bt_analyzer = self.bt_vectorizer.build_analyzer()




[docs]
    def evaluate(self):
        """Computes and outputs the coherence score.
        """

        if self.too_few: return
        try:
            self.pre_evaluate()

            # Extract features for Topic Coherence evaluation
            # words = self.bt_vectorizer.get_feature_names_out()
            tokens = [self.bt_analyzer(doc) for doc in self.cleaned_docs]

            self.dictionary = corpora.Dictionary(tokens)
            self.corpus = [self.dictionary.doc2bow(doc) for doc in tokens]

            topic_words = [[words for words, _ in self.model.get_topic(topic)] 
                            for topic in range(len(set(self.model.topics_))-1)]

            coherence = CoherenceModel(topics=topic_words, texts=tokens, corpus=self.corpus, 
                                                            dictionary=self.dictionary, coherence='c_v')\
                        .get_coherence()

            if math.isnan(coherence):
                print("** No coherence score computed!")
            else:
                print(f"  Coherence: {coherence}")
        except:
            print("** No coherence score computed!")




[docs]
    def viz(self, web_app=False):
        """Visualize the built BERTopic model through Intertopic Distance Map,
        Topic Word Score Charts, and Topic Similarity Matrix.

        :param web_app: The flag indicating the function is initiated from a web
            application, default to False
        :type web_app: bool
        """

        if self.too_few: return
        print("\nBERTopic Model Visualization:")

        fig = plt.figure(figsize=(10, 8))

        # Intertopic Distance Map
        try:
            fig = self.model.visualize_topics()
            if web_app:
                self.figures.append(("Intertopic Distance Map", fig))
            else:
                fig.show()
        except:
            print("** No Intertopic Distance Map shown for your text!")

        # Visualize Terms (Topic Word Scores)
        try:
            fig = self.model.visualize_barchart()
            if web_app:
                self.figures.append(("Topic Word Scores", fig))
            else:
                fig.show()
        except:
            print("** No chart of Topic Word Scores shown for your text!")

        # Visualize Topic Similarity
        try:
            fig = self.model.visualize_heatmap()
            if web_app:
                self.figures.append(("Topic Similarity", fig))
            else:
                fig.show()
        except:
            print("** No heatmap of Topic Similarity shown for your text!")

        # print("  Commands to show model visualization:")
        # print("      btm.model.visualize_topics()")
        # print("      btm.model.visualize_barchart()")
        # print("      btm.model.visualize_heatmap()")
        print()




[docs]
    def save(self, file):
        """Saves the built BERTopic model to the specified file.

        :param file: The name of the file to store the built model, default to None
        :type file: str
        """

        if file is None or len(file.strip())==0:
            print("No valid filename has been specifid!")
            return

        if file.split('.')[-1] == file:
            file += '.pickle'

        self.model.save(file, serialization="pickle")
        print(f"BERTopic model has been stored in {file!r}.")




[docs]
    def load(self, file):
        """Loads the stored BERTopic model from the specified file.

        :param file: The name of the file to be loaded, default to None
        :type file: str
        :return: The loaded BERTopic model
        :rtype: bertopic._bertopic.BERTopic
        """

        if file is None or len(file.strip())==0:
            print("No valid filename has been specifid!")
            return

        if file.split('.')[-1] == file:
            file += '.pickle'

        return BERTopic.load(file)



# End of BTM Class



[docs]
def btm_process(doc_file, num_topics=10, source=0, text_col='text', doc_size=0, cat=0, chi=False, group=True, eval=False, web_app=False):
    """Runs the whole BERTopic modeling pipeline on the prescribed text.

    The pipeline loads the corpus, preprocesses it, fits the BERTopic model,
    prints the topics, optionally evaluates the model, and finally
    visualizes it.

    :param doc_file: The filename of the prescribed text file to be loaded,
        or a BytesIO object from Streamlit's file_uploader
    :type doc_file: str or io.BytesIO
    :param num_topics: The number of topics to be modeled, default to 10
    :type num_topics: int, optional
    :param source: The source of the prescribed document file ('doc_file'),
        where 0 refers to internal store of the package and any other value
        to an external file, default to 0
    :type source: int, optional
    :param text_col: The name of the text column to be extracted
        (external file only), default to 'text'
    :type text_col: str, optional
    :param doc_size: The number of documents to be processed, 0 represents all
        documents, or the range (tuple) of documents to be processed
        (external file only), default to 0
    :type doc_size: int, tuple, optional
    :param cat: The category indicating a subset of the Scripture to be loaded, where
        0 stands for the whole Bible, 1 for OT, 2 for NT, or one of the ten categories
        ['tor', 'oth', 'ket', 'map', 'mip', 'gos', 'nth', 'pau', 'epi', 'apo'] (See
        the package's internal file 'data/book_cat.csv'), default to 0
    :type cat: int or str, optional
    :param chi: The flag indicating whether the text is processed as Chinese (True)
        or English (False), default to False
    :type chi: bool, optional
    :param group: The flag indicating whether the loaded text is grouped by chapter
        (internal store only), default to True
    :type group: bool, optional
    :param eval: The flag indicating whether the model evaluation results will be shown,
        default to False
    :type eval: bool, optional
    :param web_app: The flag indicating the function is initiated from a web application,
        default to False
    :type web_app: bool
    :return: The pipelined BTM, or None when the model could not be fitted
        (the BTM's 'too_few' flag was set)
    :rtype: cwordtm.tm.BTM object
    """

    topic_model = BTM(doc_file, num_topics, chi)

    # Load the corpus from the package's internal store or an external file
    from_package = (source == 0)
    if from_package:
        topic_model.docs = load_bible(topic_model.doc_file, cat=cat, group=group)
    else:
        topic_model.docs = load_text(topic_model.doc_file, doc_size, text_col)
    print("Corpus loaded!")

    # Pick the language-specific preprocessing and fitting steps
    if chi:
        prepare = topic_model.preprocess_chi
        prepared_msg = "Chinese text preprocessed!"
        train = topic_model.fit_chi
    else:
        prepare = topic_model.preprocess
        prepared_msg = "Text preprocessed!"
        train = topic_model.fit

    prepare()
    print(prepared_msg)
    train()

    # Fitting failed (possibly too few documents): stop without a model
    if topic_model.too_few:
        return None

    print("Text trained!")

    topic_model.show_topics()

    if eval:
        print("\nModel Evaluation Scores:")
        topic_model.evaluate()

    topic_model.viz(web_app)

    return topic_model


# End of cwordtm.tm Module