Source code for cwordtm.util

# util.py
#    
# Some utility functions including loading Scripture, setting Scripture language,
#   extracting a specific range of Scripture
#
# Copyright (c) 2025 CWordTM Project 
# Author: Johnny Cheng <drjohnnycheng@gmail.com>
#
# Updated: 16-Jun-2024 (0.6.4), 24-Dec-2024, 13-Jan-2025, 28-Jan-2025 (0.7.4)
#
# URL: https://github.com/drjohnnycheng/cwordtm.git
# For license information, see LICENSE.TXT


import re
import string
from io import BytesIO
from io import StringIO
import numpy as np
import pandas as pd
from importlib_resources import files

import nltk
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import jieba
from collections import Counter


# Module-level state shared by the functions below.
chi_flag = False   # value returned by is_chi()
glang = 'en'       # current language code; assigned by set_lang(), read by get_diction() and get_sent_terms()
stops = set()      # stop words; assigned by set_lang(), read by the tokenising helpers



[docs]
def is_chi():
    """Reports whether Chinese text processing is currently flagged.

    :return: The current value of the module-level flag ``chi_flag``
    :rtype: bool
    """

    return chi_flag




[docs]
def bible_cat_info(lang='en'):
    """Loads the table of Bible book categories together with their books.

    :param lang: The language of the table, either 'en' (English) or
        'chi' (Chinese), default to "en"
    :type lang: str, optional
    :return: The category table read from the packaged CSV file, or an
        error message if the language is neither 'en' nor 'chi'
    :rtype: pandas.DataFrame or str
    """

    # Reject unsupported languages before touching the packaged data
    if lang != 'en' and lang != 'chi':
        return  "The language should be either English ('en') or Chinese ('chi')"

    if lang == 'chi':
        cat_file = 'category_chi.csv'
    else:
        cat_file = 'category.csv'

    return pd.read_csv(files('cwordtm.data').joinpath(cat_file))




[docs]
def remove_noise(text, noise_list):
    """Strips trailing whitespace from the text and deletes every
    occurrence of each substring listed in noise_list.

    :param text: The input text
    :type text: str
    :param noise_list: The substrings to be removed, applied in order
    :type noise_list: list
    :return: The text with the prescribed substrings removed
    :rtype: str
    """

    cleaned = text.rstrip()
    for fragment in noise_list:
        cleaned = cleaned.replace(fragment, '')
    return cleaned




[docs]
def load_csv(file_obj, doc_size=0, info=False):
    """Loads documents from a CSV file with a "text" column, or from a
    plain text file with one document per line.

    The input is treated as CSV when its name ends with 'csv' (case
    insensitive); anything else is read as a text file whose stripped lines
    become the rows of the "text" column.

    :param file_obj: The prescribed file path from which the text is loaded,
        or a BytesIO object (e.g. from Streamlit's file_uploader); a BytesIO
        object without a 'name' attribute is read as a text file
    :type file_obj: str or io.BytesIO
    :param doc_size: The number of documents to be loaded, 0 represents all documents,
        or the 1-based inclusive range (tuple) of documents to be processed,
        default to 0
    :type doc_size: int, tuple, optional
    :param info: The flag whether the dataset information is shown,
        default to False
    :type info: bool, optional
    :return: The collection of text with the prescribed number of rows loaded
    :rtype: pandas.DataFrame
    :raises KeyError: If a CSV input has no "text" column
    """

    is_buffer = isinstance(file_obj, BytesIO)
    if is_buffer:
        # A plain BytesIO has no 'name'; only upload wrappers provide one
        fname = str(getattr(file_obj, 'name', ''))
    else:
        fname = str(file_obj)

    if fname.lower().endswith('csv'):
        df = pd.read_csv(file_obj, encoding='utf-8')
    else:  # text file
        if is_buffer:
            # Read line by line and strip, exactly like the file-path branch,
            # so both sources yield the same rows (no '\r', no trailing blank)
            source = StringIO(file_obj.getvalue().decode("utf-8"))
            lines = [line.strip() for line in source]
        else:
            # Context manager guarantees the file handle is closed
            with open(file_obj, encoding='utf-8') as tf:
                lines = [line.strip() for line in tf]

        df = pd.DataFrame({'text': lines})

    if isinstance(doc_size, int):
        if doc_size > 0:
            df = df.iloc[:doc_size]
    elif isinstance(doc_size, tuple):
        # Clamp so that a range starting at 0 does not wrap to the last row
        start = max(doc_size[0] - 1, 0)
        df = df.iloc[start:doc_size[1]]

    # Work on an independent copy: the slices above are views of the
    # loaded frame and must not be assigned to directly
    df = df.copy()
    for noise in ('\u3000', '─ ', '•'):
        # Literal (non-regex) replacement regardless of the pandas default
        df['text'] = df['text'].str.replace(noise, '', regex=False)

    if info:
        print("\nDataset Information:")
        df.info()

    return df




[docs]
def load_text(file_obj, doc_size=0, info=False):
    """Loads a plain text source, one document per line, into a DataFrame
    with a single "text" column.

    Every line is stripped of leading and trailing whitespace, and the
    ideographic space, '─ ' and '•' noise substrings are removed.

    :param file_obj: The prescribed file path from which the text is loaded,
        or a BytesIO object (e.g. from Streamlit's file_uploader)
    :type file_obj: str or io.BytesIO
    :param doc_size: The number of documents to be loaded, 0 represents all documents,
        or the 1-based inclusive range (tuple) of documents to be processed,
        default to 0
    :type doc_size: int, tuple, optional
    :param info: The flag whether the dataset information is shown,
        default to False
    :type info: bool, optional
    :return: The collection of text with the prescribed number of rows loaded
    :rtype: pandas.DataFrame
    """

    if isinstance(file_obj, BytesIO):
        # Read line by line and strip, exactly like the file-path branch,
        # so both sources yield the same rows (no '\r', no trailing blank)
        source = StringIO(file_obj.getvalue().decode("utf-8"))
        lines = [line.strip() for line in source]
    else:
        # Context manager guarantees the file handle is closed
        with open(file_obj, encoding='utf-8') as tf:
            lines = [line.strip() for line in tf]

    df = pd.DataFrame({'text': lines})

    if isinstance(doc_size, int):
        if doc_size > 0:
            df = df.iloc[:doc_size]
    elif isinstance(doc_size, tuple):
        # Clamp so that a range starting at 0 does not wrap to the last row
        start = max(doc_size[0] - 1, 0)
        df = df.iloc[start:doc_size[1]]

    # Work on an independent copy: the slices above are views of the
    # loaded frame and must not be assigned to directly
    df = df.copy()
    for noise in ('\u3000', '─ ', '•'):
        # Literal (non-regex) replacement regardless of the pandas default
        df['text'] = df['text'].str.replace(noise, '', regex=False)

    if info:
        print("\nDataset Information:")
        df.info()

    return df




[docs]
def load_word(ver='web.csv', nr=0, info=False):
    """Loads and returns the text from the prescribed internal file ('ver').

    :param ver: The package's internal Bible text from which the text is loaded,
        either World English Bible ('web.csv') or Chinese Union Version
        (Traditional)('cuv.csv'), default to 'web.csv'
    :type ver: str, optional
    :param nr: The number of initial rows of Scripture to be printed as a
        preview; 0 prints no preview. The full table is returned in
        either case, default to 0
    :type nr: int, optional
    :param info: The flag whether the dataset information is shown,
        default to False
    :type info: bool, optional
    :return: The whole collection of Scripture loaded from the file
    :rtype: pandas.DataFrame
    """

    scfile = files('cwordtm.data').joinpath(ver)
    print("Loading file '%s' ..." %scfile)
    df = pd.read_csv(scfile)
    if nr > 0:
        print("Initial Records:")
        # A bare df.head() is discarded inside a function; print the
        # preview so the heading above is actually followed by the rows
        print(df.head(int(nr)))
    if info:
        print("\nDataset Information:")
        df.info()
    return df




[docs]
def group_text(df, column='chapter'):
    """Concatenates the Scripture text of each group formed by 'book_no'
    and the prescribed column.

    :param df: The input DataFrame storing the Scripture; it must contain
        the columns 'book_no', 'text' and the grouping column
    :type df: pandas.DataFrame
    :param column: The column by which the Scripture is grouped within
        each book, default to 'chapter'
    :type column: str, optional
    :return: One row per group with columns 'book_no', the grouping column
        and the joined 'text'
    :rtype: pandas.DataFrame
    """

    def _concat(parts):
        # Texts of a group are joined without any separator
        return ''.join(parts)

    keys = ['book_no', column]
    joined = df.groupby(keys).agg({'text': _concat})
    return joined.reset_index()




[docs]
def get_list(df, column='book'):
    """Returns the distinct values of the prescribed column of the
    DataFrame 'df', in order of first appearance.

    :param df: The input DataFrame storing the Scripture
    :type df: pandas.DataFrame
    :param column: The column whose distinct values are wanted,
        default to 'book'
    :type column: str, optional
    :return: The list of distinct values, or the message
        'No such column!' if the column does not exist
    :rtype: list or str
    """

    if column not in list(df.columns):
        return "No such column!"

    distinct = df[column].unique()
    return list(distinct)




[docs]
def get_text(df, text_col='text'):
    """Joins the text column of the DataFrame 'df' into one string,
    separating rows by a single space, and removes all ideographic
    spaces (U+3000) from the result.

    :param df: The input DataFrame storing the Scripture
    :type df: pandas.DataFrame
    :param text_col: The name of the text column to be extracted, default to 'text'
    :type text_col: str, optional
    :return: The extracted text
    :rtype: str
    """

    # Values are cast to str so that non-string cells can be joined
    pieces = list(df[text_col].astype(str))
    joined = ' '.join(pieces)
    return joined.replace('\u3000', '')




[docs]
def get_text_list(df, text_col='text'):
    """Returns the text column of the DataFrame 'df' as a list of strings
    with all ideographic spaces (U+3000) removed from each entry.

    :param df: The input DataFrame storing the Scripture
    :type df: pandas.DataFrame
    :param text_col: The name of the text column to be extracted, default to 'text'
    :type text_col: str, optional
    :return: The extracted text, one entry per row
    :rtype: list
    """

    return [entry.replace('\u3000', '') for entry in df[text_col]]




[docs]
def clean_text(df, text_col='text'):
    """Cleans the text column of the DataFrame 'df' in place.

    Each value is converted to a string and then, in this order: newlines
    are replaced by spaces, all digits are removed, English stopwords that
    appear surrounded by single spaces are removed (matched before
    lower-casing, so the match is case-sensitive), whitespace is collapsed,
    the text is lower-cased, and every character that is neither a word
    character nor whitespace is removed.

    :param df: The input DataFrame storing the Scripture; its text column
        is overwritten
    :type df: pandas.DataFrame
    :param text_col: The name of the text column to be cleaned, default to 'text'
    :type text_col: str, optional
    :return: The same DataFrame with the cleaned text
    :rtype: pandas.DataFrame
    """

    # Load the stopword list once instead of per replacement pass
    stop_words = stopwords.words('english')

    def _scrub(value):
        text = re.sub(r'\d+', '', str(value).replace('\n', ' '))
        # Same sequential, order-dependent replacement as before, but done
        # per value so the column is rebuilt once rather than once per stopword
        for sw in stop_words:
            text = text.replace(' ' + sw + ' ', ' ')
        return " ".join(w.lower() for w in text.split())

    df[text_col] = [_scrub(v) for v in df[text_col]]
    # Raw string: '\w' and '\s' are regex classes, not string escapes
    df[text_col] = df[text_col].str.replace(r'[^\w\s]', '', regex=True)
    return df




[docs]
def clean_sentences(sentences):
    """Runs preprocess_text on every sentence and keeps only the
    results that are not empty.

    :param sentences: The list of sentences to be cleaned
    :type sentences: list
    :return: The list of cleaned, non-empty sentences
    :rtype: list
    """

    processed = (preprocess_text(sentence) for sentence in sentences)
    return [result for result in processed if len(result) > 0]




[docs]
def preprocess_text(text):
    """Preprocesses English text by converting it to lower case, removing
    every character that is not an ASCII letter or whitespace (digits,
    punctuation and special characters), removing stopwords, removing
    short words (fewer than 3 letters), and lemmatizing the remaining words.

    :param text: The text to be preprocessed; a list, numpy array or
        pandas Series is first joined into one space-separated string
    :type text: str, list, numpy.ndarray or pandas.Series
    :return: The preprocessed text, words separated by single spaces
    :rtype: str
    """

    if isinstance(text, (list, np.ndarray)):
        text = ' '.join(str(item) for item in text)
    elif isinstance(text, pd.Series):
        text = ' '.join(list(text.astype(str)))

    # Convert text to lowercase
    text = text.lower()

    # Keep letters and whitespace only; this already strips all digits and
    # punctuation, so no separate punctuation pass is needed
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Build the stopword set once: calling stopwords.words() per token
    # re-reads the corpus list for every word
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Tokenize once; dropping tokens never changes how the remaining
    # tokens would be split, so re-tokenizing between steps is unnecessary
    kept = [word for word in nltk.word_tokenize(text)
            if word not in stop_words and len(word) >= 3]

    return " ".join(lemmatizer.lemmatize(word) for word in kept)




[docs]
def add_chi_vocab():
    """Reads the packaged Chinese Bible vocabulary file 'bible_vocab.txt'
    and registers each line as a word in Jieba, with frequency 1000,
    for later tokenization.
    """

    vocab_file = files('cwordtm.data').joinpath('bible_vocab.txt')
    print("Loading Chinese vocabulary '%s' ..." %vocab_file)
    with open(vocab_file, 'r', encoding='utf8') as f:
        for entry in f:
            # Drop the line terminator before handing the word to Jieba
            word = entry.replace('\n', '')
            jieba.add_word(word, freq=1000)




[docs]
def chi_stops():
    """Points Jieba at the packaged dictionary file 'dict.txt.big.txt'
    and reads the packaged stopword file 'stopWord_cloudmod.txt'.

    :return: The lines of the stopword file, as a list of stopwords
        for wordcloud plotting
    :rtype: list
    """

    dictionary_dir = files('cwordtm.dictionary')
    jieba.set_dictionary(dictionary_dir.joinpath('dict.txt.big.txt'))

    stopword_path = dictionary_dir.joinpath('stopWord_cloudmod.txt')
    # 'utf-8-sig' transparently drops a leading byte-order mark, if any
    with open(stopword_path, 'r', encoding='utf-8-sig') as f:
        contents = f.read()
    return contents.split('\n')




[docs]
def set_lang(lang='en'):
    """Sets the prescribed language (English or Chinese (Traditional))
    for further text processing.

    Updates the module-level settings: 'glang' (the language code),
    'stops' (the stopwords of that language) and 'chi_flag' (True only
    when Chinese is selected). For Chinese, the Bible vocabulary and the
    Chinese dictionary are also loaded into Jieba.

    :param lang: The prescribed language for text processing, where
        'en' stands for English; any other value is treated as
        Traditional Chinese ('chi'), default to 'en'
    :type lang: str, optional
    """

    # chi_flag must be declared global too, otherwise the assignment below
    # only creates a local variable and is_chi() never sees the change
    global glang, stops, chi_flag
    glang = lang
    if glang == 'en':  # English
        stops = set(stopwords.words("english"))
        chi_flag = False  # clear the flag when switching back from Chinese
    else:  # Chinese (Traditional)
        add_chi_vocab()
        stops = chi_stops()
        chi_flag = True




[docs]
def get_diction_en(docs):
    """Tokenizes the collection of English documents, drops stopwords,
    stems the remaining words with the Porter stemmer, and builds a
    dictionary of the stemmed words with their frequencies.

    :param docs: The collection of text; a DataFrame contributes its
        'text' column, and any collection is joined with spaces into a
        single string before tokenization
    :type docs: pandas.DataFrame, pandas.Series, list, numpy.ndarray or str
    :return: The dictionary of stemmed words with their frequencies
    :rtype: collections.Counter
    """

    if isinstance(docs, pd.DataFrame):
        docs = ' '.join(list(docs.text.astype(str)))
    elif isinstance(docs, pd.Series):
        docs = ' '.join(list(docs.astype(str)))
    elif isinstance(docs, (list, np.ndarray)):
        docs = ' '.join(str(doc) for doc in docs)

    stemmer = PorterStemmer()

    terms = []
    for token in word_tokenize(docs):
        stemmed = stemmer.stem(token)
        # The stopword list holds unstemmed words, so the raw token must be
        # checked as well: e.g. "this" stems to "thi", which is not listed.
        # The stem is still checked to keep every previous exclusion.
        if token.lower() in stops or stemmed in stops:
            continue
        terms.append(stemmed)

    return Counter(terms)




[docs]
def get_diction_chi(docs):
    """Tokenizes the collection of Chinese documents with Jieba and builds
    a dictionary of the words, excluding stopwords, with their frequencies.

    :param docs: The collection of documents; a DataFrame contributes its
        'text' column, and any collection is concatenated without
        separators into a single string before tokenization
    :type docs: pandas.DataFrame or list
    :return: The dictionary of words with their frequencies
    :rtype: dict
    """

    if isinstance(docs, pd.DataFrame):
        docs = ''.join(docs.text.astype(str).tolist())
    elif isinstance(docs, pd.Series):
        docs = ''.join(docs.astype(str).tolist())
    elif isinstance(docs, (list, np.ndarray)):
        docs = ''.join(str(doc) for doc in docs)

    # Strip ideographic spaces first, then the listed Chinese punctuation
    without_spaces = docs.replace('\u3000', '')
    stripped = re.sub("[、．。，！？『』「」〔〕]", "", without_spaces)

    segments = jieba.cut(stripped, cut_all=False)
    return Counter(term for term in segments if term not in stops)




[docs]
def get_diction(docs):
    """Builds a dictionary of words with their frequencies using the
    English builder when the global language is 'en', and the Chinese
    builder otherwise.

    :param docs: The collection of documents
    :type docs: pandas.DataFrame or list
    :return: The dictionary of words with their frequencies
    :rtype: dict
    """

    builder = get_diction_en if glang == 'en' else get_diction_chi
    return builder(docs)




[docs]
def chi_sent_terms(text):
    """Removes the listed Chinese punctuation from the input text, tokenizes
    it with Jieba, and returns the words that are not stopwords.

    :param text: The input Chinese text to be tokenized
    :type text: str
    :return: The list of Chinese words
    :rtype: list
    """

    stripped = re.sub("[、．。，：！？『』「」〔〕]", "", text)
    segments = jieba.cut(stripped, cut_all=False)
    return [term for term in segments if term not in stops]




[docs]
def get_sent_terms(text):
    """Tokenizes the input text according to the global language setting:
    NLTK word tokenization for English ('en'), and the Chinese tokenizer
    for any other setting.

    :param text: The input text to be tokenized
    :type text: str
    :return: The list of tokenized words
    :rtype: list
    """

    if glang != 'en':
        return chi_sent_terms(text)
    return word_tokenize(text)




[docs]
def _unique_values(df, column):
    """Returns the distinct values of a column, or an empty list if the
    column does not exist, so that membership tests are always safe."""

    if column in df.columns:
        return list(df[column].unique())
    return []


def extract(df, testament=-1, category='', book=0, chapter=0, verse=0):
    """Extracts a subset of the Scripture stored in a DataFrame by testament,
    category, or book/chapter/verse.

    The selectors are tried in this order and only the first one that is
    prescribed is used: testament, category, book. Chapter and verse
    filters are applied only after a book selection.

    :param df: The collection of the Bible Scripture with columns 'book',
        'book_no', 'chapter', 'verse', 'text', 'testament', 'category',
        'cat', and 'cat_no'
    :type df: pandas.DataFrame
    :param testament: The prescribed testament to be extracted,
        -1 stands for no prescription, 0 for OT, or 1 for NT,
        default to -1
    :type testament: int, optional
    :param category: The prescribed category to be extracted, matched
        against the 'category' column first and the 'cat' column second,
        default to ''
    :type category: str, optional
    :param book: The prescribed Bible book to be extracted: a value of the
        'book' column, a book number from 1 to 66, or a tuple indicating
        a range of book numbers, default to 0
    :type book: str, int or tuple, optional
    :param chapter: The prescribed chapter or a tuple indicating the range of
        chapters of a Bible book to be extracted, default to 0
    :type chapter: int or tuple, optional
    :param verse: The prescribed verse or a tuple indicating the range of verses
        from a chapter of a Bible book to be extracted, default to 0
    :type verse: int or tuple, optional
    :return: A copy of the subset of the input Scripture, if any, otherwise,
        the message 'No scripture is extracted!'
    :rtype: pandas.DataFrame or str
    """

    no_ret = "No scripture is extracted!"
    sub_df = pd.DataFrame()  # Empty DataFrame
    isbook = ischapter = False

    if -1 < testament < 2:
        sub_df = df[df.testament == int(testament)]
    elif category != '':
        if category in _unique_values(df, 'category'):
            sub_df = df[df.category == category]
        elif category in _unique_values(df, 'cat'):
            sub_df = df[df.cat == category]
    elif book in _unique_values(df, 'book'):
        sub_df = df[df.book == book]
        isbook = True
    elif isinstance(book, int):
        # Chained comparison: 'book > 0 & book < 67' parsed as
        # 'book > (0 & book) < 67' and never checked the upper bound
        if 0 < book < 67:
            sub_df = df[df.book_no == book]
            isbook = True
    elif isinstance(book, tuple):
        if 0 < book[0] <= book[1] < 67:
            sub_df = df[(df.book_no >= book[0]) & (df.book_no <= book[1])]
            isbook = True

    if isbook and len(sub_df) > 0 and chapter != 0:
        if isinstance(chapter, int):
            sub_df = sub_df[sub_df.chapter == chapter]
            ischapter = True
        elif isinstance(chapter, tuple):
            if chapter[0] <= chapter[1]:
                sub_df = sub_df[(sub_df.chapter >= chapter[0]) & (sub_df.chapter <= chapter[1])]
                ischapter = True

        if ischapter and len(sub_df) > 0 and verse != 0:
            if isinstance(verse, int):
                sub_df = sub_df[sub_df.verse == verse]
            elif isinstance(verse, tuple):
                if verse[0] <= verse[1]:
                    sub_df = sub_df[(sub_df.verse >= verse[0]) & (sub_df.verse <= verse[1])]

    if len(sub_df) > 0:
        # Return a copy so callers can modify the result freely
        return sub_df.copy()
    return no_ret




[docs]
def extract2(df, filter=''):
    """Extracts a subset of the Scripture through a specific filter string by
    parsing it and invoking the function 'extract'.

    :param df: The collection of the Bible Scripture
    :type df: pandas.DataFrame
    :param filter: The prescribed filter string with the format
        '<book> <chapter>:<verse>[-<verse2>]' for extracting a range of verses
        in the Scripture; a missing first verse defaults to 1 and a missing
        last verse after '-' defaults to 999, default to ''
    :type filter: str, optional
    :return: The result of 'extract' for the parsed book, chapter and verse,
        or the input DataFrame itself if the filter string is empty
    :rtype: pandas.DataFrame or str
    """

    if filter == '':
        return df

    tokens = filter.split()
    book = tokens[0]
    chapter = 0
    verse = 0

    if len(tokens) > 1:
        # Second token is '<chapter>[:<verse>[-<verse2>]]'
        reference = tokens[1].split(':')
        chapter = int(reference[0]) if reference[0] != '' else 0

        if len(reference) > 1 and reference[1] != '':
            span = reference[1].split('-')
            first = int(span[0]) if span[0] != '' else 1
            if len(span) > 1:
                last = int(span[1]) if span[1] != '' else 999
                verse = (first, last)
            else:
                verse = first

    return extract(df, book=book, chapter=chapter, verse=verse)




[docs]
def set_rows(n=None):
    """Sets the pandas display option for the maximum number of DataFrame
    rows to be shown.

    :param n: The maximum no. of rows to be set, value None denotes that
        all rows are to be displayed, default to None
    :type n: int, optional
    """

    pd.set_option("display.max_rows", n)




[docs]
def reset_rows():
    """Restores the pandas display option for the maximum number of
    DataFrame rows to be shown to its default value.
    """

    option_name = "display.max_rows"
    pd.reset_option(option_name)