# util.py
#
# Some utility functions including loading Scripture, setting Scripture language,
# extracting a specific range of Scripture
#
# Copyright (c) 2025 CWordTM Project
# Author: Johnny Cheng <drjohnnycheng@gmail.com>
#
# Updated: 16-Jun-2024 (0.6.4), 24-Dec-2024, 13-Jan-2025, 28-Jan-2025 (0.7.4)
#
# URL: https://github.com/drjohnnycheng/cwordtm.git
# For license information, see LICENSE.TXT
import re
import string
from io import BytesIO
from io import StringIO
import numpy as np
import pandas as pd
from importlib_resources import files
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import jieba
from collections import Counter
# Module-level language state shared by the helper functions in this file.
chi_flag = False  # Chinese language flag; this is the value is_chi() returns
glang = 'en'  # current language code; 'en' selects the English code paths
stops = set()  # stopwords skipped when building term lists; replaced by set_lang()
[docs]
def is_chi():
    """Reports the current state of the module's Chinese language flag.

    :return: The value of the module-level flag ``chi_flag``: True when the
        Chinese language flag is set, False otherwise
    :rtype: bool
    """
    flag_state = chi_flag
    return flag_state
[docs]
def bible_cat_info(lang='en'):
    """Loads the table of Bible book categories together with their books.

    The table is read from the package data file 'category.csv' (English)
    or 'category_chi.csv' (Chinese).

    :param lang: The language of the information to be shown, either 'en'
        or 'chi', default to "en"
    :type lang: str, optional
    :return: The table of Bible book categories, or an explanatory message
        (str) when the language is neither 'en' nor 'chi'
    :rtype: pandas.DataFrame
    """
    supported = ['en', 'chi']
    if lang not in supported:
        return "The language should be either English ('en') or Chinese ('chi')"

    # Pick the category file that matches the requested language
    if lang == 'chi':
        cat_file = 'category_chi.csv'
    else:
        cat_file = 'category.csv'

    data_path = files('cwordtm.data').joinpath(cat_file)
    return pd.read_csv(data_path)
[docs]
def remove_noise(text, noise_list):
    """Strips trailing whitespace from the text and deletes every occurrence
    of each substring listed in noise_list.

    The substrings are removed one after another, in list order.

    :param text: The input text
    :type text: str
    :param noise_list: The substrings to be removed
    :type noise_list: list
    :return: The text with the prescribed substrings removed
    :rtype: str
    """
    cleaned = text.rstrip()
    remaining = iter(noise_list)
    for fragment in remaining:
        cleaned = cleaned.replace(fragment, '')
    return cleaned
[docs]
def load_csv(file_obj, doc_size=0, info=False):
    """Loads a CSV file with a "text" column, or a plain text file holding
    one document per line.

    A source whose name ends with 'csv' (case-insensitive) is parsed with
    pandas.read_csv; any other source is read as UTF-8 text and each
    whitespace-stripped line becomes one row of the "text" column.
    The noise substrings (ideographic space U+3000, '─ ' and '•') are then
    removed from the "text" column.

    :param file_obj: The prescribed file path from which the text is loaded,
        or a BytesIO object from Streamlit's file_uploader
    :type file_obj: str or io.BytesIO
    :param doc_size: The number of documents to be loaded, 0 represents all documents,
        or the 1-based inclusive range (tuple) of documents to be processed,
        default to 0
    :type doc_size: int, tuple, optional
    :param info: The flag whether the dataset information is shown,
        default to False
    :type info: bool, optional
    :return: The collection of text with the prescribed number of rows loaded
    :rtype: pandas.DataFrame
    """
    is_upload = isinstance(file_obj, BytesIO)
    if is_upload:
        # A plain BytesIO has no 'name' attribute (only upload wrappers add
        # one), so fall back to '' and treat the content as plain text.
        fname = str(getattr(file_obj, 'name', ''))
    else:
        fname = str(file_obj)

    if fname.lower().endswith('csv'):
        df = pd.read_csv(file_obj, encoding='utf-8')
    else:  # text file
        if is_upload:
            # Iterate line by line and strip, exactly like the file-path
            # branch, so '\r' from Windows files and a phantom empty last
            # row do not leak into the data.
            source = StringIO(file_obj.getvalue().decode("utf-8"))
            lines = [line.strip() for line in source]
        else:
            # The context manager guarantees the file handle is closed
            with open(file_obj, encoding='utf-8') as tf:
                lines = [line.strip() for line in tf]
        df = pd.DataFrame({'text': lines})

    if isinstance(doc_size, tuple):
        # The range is 1-based; clamp the start so that 0 cannot turn into
        # a negative (count-from-the-end) index.
        start = max(doc_size[0] - 1, 0)
        df = df.iloc[start:doc_size[1]]
    elif isinstance(doc_size, int) and doc_size > 0:
        df = df.iloc[:doc_size]

    # Work on an independent copy: assigning a column on an iloc slice
    # would otherwise raise pandas' SettingWithCopyWarning.
    df = df.copy()

    noise_list = ['\u3000', '─ ', '•']
    for noise in noise_list:
        # regex=False: the noise strings are literals on every pandas version
        df['text'] = df['text'].str.replace(noise, '', regex=False)

    if info:
        print("\nDataset Information:")
        df.info()

    return df
[docs]
def load_text(file_obj, doc_size=0, info=False):
    """Loads and returns the text from the prescribed file path.

    The source is read as UTF-8 text and each whitespace-stripped line
    becomes one row of the "text" column. The noise substrings (ideographic
    space U+3000, '─ ' and '•') are then removed from that column.

    :param file_obj: The prescribed file path from which the text is loaded,
        or a BytesIO object from Streamlit's file_uploader
    :type file_obj: str or io.BytesIO
    :param doc_size: The number of documents to be loaded, 0 represents all documents,
        or the 1-based inclusive range (tuple) of documents to be processed,
        default to 0
    :type doc_size: int, tuple, optional
    :param info: The flag whether the dataset information is shown,
        default to False
    :type info: bool, optional
    :return: The collection of text with the prescribed number of rows loaded
    :rtype: pandas.DataFrame
    """
    if isinstance(file_obj, BytesIO):
        # Iterate line by line and strip, exactly like the file-path branch,
        # so '\r' from Windows files and a phantom empty last row do not
        # leak into the data.
        source = StringIO(file_obj.getvalue().decode("utf-8"))
        lines = [line.strip() for line in source]
    else:
        # The context manager guarantees the file handle is closed
        with open(file_obj, encoding='utf-8') as tf:
            lines = [line.strip() for line in tf]

    df = pd.DataFrame({'text': lines})

    if isinstance(doc_size, tuple):
        # The range is 1-based; clamp the start so that 0 cannot turn into
        # a negative (count-from-the-end) index.
        start = max(doc_size[0] - 1, 0)
        df = df.iloc[start:doc_size[1]]
    elif isinstance(doc_size, int) and doc_size > 0:
        df = df.iloc[:doc_size]

    # Work on an independent copy: assigning a column on an iloc slice
    # would otherwise raise pandas' SettingWithCopyWarning.
    df = df.copy()

    noise_list = ['\u3000', '─ ', '•']
    for noise in noise_list:
        # regex=False: the noise strings are literals on every pandas version
        df['text'] = df['text'].str.replace(noise, '', regex=False)

    if info:
        print("\nDataset Information:")
        df.info()

    return df
[docs]
def load_word(ver='web.csv', nr=0, info=False):
    """Loads and returns the text from the prescribed internal file ('ver').

    :param ver: The package's internal Bible text from which the text is loaded,
        either World English Bible ('web.csv') or Chinese Union Version
        (Traditional)('cuv.csv'), default to 'web.csv'
    :type ver: str, optional
    :param nr: The number of initial rows of Scripture to be printed as a
        preview; 0 prints no preview, default to 0
    :type nr: int, optional
    :param info: The flag whether the dataset information is shown,
        default to False
    :type info: bool, optional
    :return: The collection of Scripture loaded from the file
    :rtype: pandas.DataFrame
    """
    scfile = files('cwordtm.data').joinpath(ver)
    print("Loading file '%s' ..." % scfile)
    df = pd.read_csv(scfile)

    if nr > 0:
        print("Initial Records:")
        # head() returns a new DataFrame and has no display side effect, so
        # it must be printed for the heading above to be followed by rows.
        # NOTE(review): the returned DataFrame is still the full table, as
        # before; confirm whether 'nr' was also meant to truncate it.
        print(df.head(int(nr)))

    if info:
        print("\nDataset Information:")
        df.info()

    return df
[docs]
def group_text(df, column='chapter'):
    """Concatenates the Scripture text in the DataFrame 'df' for every
    combination of 'book_no' and the prescribed column.

    Only the columns 'book_no', the grouping column and 'text' are read here.

    :param df: The input DataFrame storing the Scripture
    :type df: pandas.DataFrame
    :param column: The column by which the Scripture is grouped (together
        with 'book_no'), default to 'chapter'
    :type column: str, optional
    :return: The grouped Scripture, one row per group with the group's text
        joined without any separator
    :rtype: pandas.DataFrame
    """
    def _concat(parts):
        # Join the text pieces of one group with no separator
        return ''.join(parts)

    group_keys = ['book_no', column]
    merged = df.groupby(group_keys).agg({'text': _concat})
    return merged.reset_index()
[docs]
def get_list(df, column='book'):
    """Collects the distinct values of the prescribed column of the
    Scripture stored in the DataFrame 'df', in order of first appearance.

    :param df: The input DataFrame storing the Scripture
    :type df: pandas.DataFrame
    :param column: The column whose distinct values are extracted,
        default to 'book'
    :type column: str, optional
    :return: The list of distinct values, or the message "No such column!"
        when 'df' has no such column
    :rtype: list
    """
    available = list(df.columns)
    if column not in available:
        return "No such column!"

    distinct = df[column].unique()
    return list(distinct)
[docs]
def get_text(df, text_col='text'):
    """Builds a single string from the text column of the DataFrame 'df'.

    Every cell is converted to a string, the cells are joined with a single
    space, and all ideographic spaces (U+3000) are removed from the result.

    :param df: The input DataFrame storing the Scripture
    :type df: pandas.DataFrame
    :param text_col: The name of the text column to be extracted, default to 'text'
    :type text_col: str, optional
    :return: The extracted text
    :rtype: str
    """
    pieces = list(df[text_col].astype(str))
    joined = ' '.join(pieces)
    return joined.replace('\u3000', '')
[docs]
def get_text_list(df, text_col='text'):
    """Extracts and returns the list of text from the DataFrame 'df' after
    removing all the ideographic spaces (U+3000) from the text.

    Every cell is converted to a string first (as get_text does), so cells
    that are not strings, such as numbers or missing values, no longer raise
    an AttributeError.

    :param df: The input DataFrame storing the Scripture
    :type df: pandas.DataFrame
    :param text_col: The name of the text column to be extracted, default to 'text'
    :type text_col: str, optional
    :return: The extracted text, one string per row
    :rtype: list
    """
    as_text = df[text_col].astype(str)
    # regex=False: U+3000 is a literal character, not a pattern
    return as_text.str.replace('\u3000', '', regex=False).tolist()
[docs]
def clean_text(df, text_col='text'):
    """Cleans the text from the Scripture stored in the DataFrame 'df',
    by removing all digits, replacing newline by a space, removing
    English stopwords, converting all characters to lower case, and
    removing all characters except alphanumeric and whitespace.

    The text column of 'df' is overwritten in place and 'df' is returned.

    :param df: The input DataFrame storing the Scripture
    :type df: pandas.DataFrame
    :param text_col: The name of the text column to be cleaned, default to 'text'
    :type text_col: str, optional
    :return: The cleaned text in a DataFrame (the same object as 'df')
    :rtype: pandas.DataFrame
    """
    # Fetch the stopword list once instead of once per pass over the column
    english_stops = stopwords.words('english')
    # Raw string: '\w' and '\s' are invalid escapes in a normal literal
    non_word = re.compile(r'[^\w\s]')

    def _clean(value):
        # Clean one cell: digits, newlines, stopwords, case, punctuation
        text = re.sub(r'\d+', '', str(value).replace('\n', ' '))
        # NOTE(review): only lower-case stopwords surrounded by spaces are
        # removed here (the text is lowered afterwards), so stopwords that
        # are capitalized or at the start/end of a cell are kept.
        for sw in english_stops:
            text = text.replace(' ' + sw + ' ', ' ')
        text = " ".join(w.lower() for w in text.split())
        return non_word.sub('', text)

    # One pass over the column; also works when the column is empty
    df[text_col] = [_clean(v) for v in df[text_col]]
    return df
[docs]
def clean_sentences(sentences):
    """Runs preprocess_text on every sentence and keeps only the results
    that are not empty.

    :param sentences: The list of sentences to be cleaned
    :type sentences: list
    :return: The list of cleaned sentences, in the original order
    :rtype: list
    """
    processed = (preprocess_text(sentence) for sentence in sentences)
    return [result for result in processed if len(result) > 0]
[docs]
def preprocess_text(text):
    """Preprocesses English text by converting text to lower case, removing
    special characters, digits and punctuation, removing stopwords,
    removing short words (fewer than 3 letters), and lemmatizing the text.

    :param text: The text to be preprocessed; a list, numpy array or pandas
        Series is first joined into one string with spaces
    :type text: str, list, numpy.ndarray or pandas.Series
    :return: The preprocessed text, with the remaining words joined by spaces
    :rtype: str
    """
    if isinstance(text, (list, np.ndarray)):
        text = ' '.join(str(item) for item in text)
    elif isinstance(text, pd.Series):
        text = ' '.join(list(text.astype(str)))

    # Convert text to lowercase
    text = text.lower()

    # Keep letters and whitespace only. This already drops every digit and
    # punctuation mark, so no separate punctuation pass is needed.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Build the stopword set once: re-reading the list for every token and
    # scanning it linearly is what made the original filter slow.
    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Tokenize once, then filter stopwords and short words (length < 3)
    # and lemmatize what is left.
    tokens = nltk.word_tokenize(text)
    kept = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in english_stops and len(word) >= 3
    ]
    return " ".join(kept)
[docs]
def add_chi_vocab():
    """Reads the Chinese Bible vocabulary from the internal file
    'bible_vocab.txt' and registers every line as a word in Jieba
    (with frequency 1000) for future tokenization.
    """
    vocab_file = files('cwordtm.data').joinpath('bible_vocab.txt')
    print("Loading Chinese vocabulary '%s' ..." %vocab_file)

    with open(vocab_file, 'r', encoding='utf8') as f:
        raw_lines = f.readlines()

    # Drop the newline characters before handing each entry to Jieba
    entries = [line.replace('\n', '') for line in raw_lines]
    for entry in entries:
        jieba.add_word(entry, freq=1000)
[docs]
def chi_stops():
    """Points Jieba at the packaged dictionary file 'dict.txt.big.txt' for
    future tokenization, and reads the Chinese stopwords from
    'stopWord_cloudmod.txt' for future wordcloud plotting.

    :return: The list of stopwords, one entry per line of the stopword file
    :rtype: list
    """
    dict_file = files('cwordtm.dictionary').joinpath('dict.txt.big.txt')
    jieba.set_dictionary(dict_file)

    cloud_file = files('cwordtm.dictionary').joinpath('stopWord_cloudmod.txt')
    # 'utf-8-sig' drops a leading byte-order mark if the file has one
    with open(cloud_file, 'r', encoding='utf-8-sig') as f:
        content = f.read()
    return content.split('\n')
[docs]
def set_lang(lang='en'):
    """Sets the prescribed language (English or Chinese (Traditional))
    for further text processing.

    Updates the module-level language code, stopwords and Chinese language
    flag. Any value other than 'en' is treated as Chinese.

    :param lang: The prescribed language for text processing, where
        'en' stands for English or 'chi' for Traditional Chinese,
        default to 'en'
    :type lang: str, optional
    """
    # chi_flag must be declared global too; otherwise the assignment below
    # only creates a local variable and is_chi() never sees the change.
    global glang, stops, chi_flag

    glang = lang
    if glang == 'en':  # English
        stops = set(stopwords.words("english"))
        # Clear the flag so switching back from Chinese is reflected
        chi_flag = False
    else:  # Chinese (Traditional)
        add_chi_vocab()
        stops = chi_stops()
        chi_flag = True
[docs]
def get_diction_en(docs):
    """Tokenizes the collection of English documents, stems every token
    and counts the stems that are not in the module's stopwords.

    :param docs: The collection of text; a DataFrame contributes its 'text'
        column, and a Series, list or numpy array is joined with spaces
    :type docs: pandas.DataFrame or list
    :return: The dictionary of words with their frequencies
    :rtype: dict
    """
    if isinstance(docs, pd.DataFrame):
        docs = ' '.join(list(docs.text.astype(str)))
    elif isinstance(docs, pd.Series):
        docs = ' '.join(list(docs.astype(str)))
    elif isinstance(docs, (list, np.ndarray)):
        docs = ' '.join(str(doc) for doc in docs)

    stemmer = PorterStemmer()
    # Stem first, then drop the stems found in the stopwords
    stemmed = (stemmer.stem(token) for token in word_tokenize(docs))
    return Counter(term for term in stemmed if term not in stops)
[docs]
def get_diction_chi(docs):
    """Tokenizes the collection of Chinese documents with Jieba and counts
    the words that are not in the module's stopwords.

    :param docs: The collection of documents; a DataFrame contributes its
        'text' column, and a Series, list or numpy array is joined without
        any separator
    :type docs: pandas.DataFrame or list
    :return: The dictionary of words with their frequencies
    :rtype: dict
    """
    if isinstance(docs, pd.DataFrame):
        docs = ''.join(list(docs.text.astype(str)))
    elif isinstance(docs, pd.Series):
        docs = ''.join(list(docs.astype(str)))
    elif isinstance(docs, (list, np.ndarray)):
        docs = ''.join(str(doc) for doc in docs)

    # Drop ideographic spaces, then the listed punctuation marks
    without_spaces = docs.replace('\u3000', '')
    text = re.sub("[、.。,!?『』「」〔〕]", "", without_spaces)

    segments = jieba.cut(text, cut_all=False)
    return Counter(term for term in segments if term not in stops)
[docs]
def get_diction(docs):
    """Builds a dictionary of words with their frequencies using the
    builder that matches the module's language setting: the English
    builder when it is 'en', the Chinese builder otherwise.

    :param docs: The collection of documents
    :type docs: pandas.DataFrame or list
    :return: The dictionary of words with their frequencies
    :rtype: dict
    """
    builder = get_diction_en if glang == 'en' else get_diction_chi
    return builder(docs)
[docs]
def chi_sent_terms(text):
    """Tokenizes the input Chinese text with Jieba after removing the
    listed punctuation marks, and drops the words found in the module's
    stopwords.

    :param text: The input Chinese text to be tokenized
    :type text: str
    :return: The list of Chinese words
    :rtype: list
    """
    stripped = re.sub("[、.。,:!?『』「」〔〕]", "", text)
    segments = jieba.cut(stripped, cut_all=False)
    return [term for term in segments if term not in stops]
[docs]
def get_sent_terms(text):
    """Tokenizes the input text according to the module's language setting:
    NLTK word tokenization when it is 'en', the Chinese tokenizer
    chi_sent_terms otherwise.

    :param text: The input text to be tokenized
    :type text: str
    :return: The list of tokenized words
    :rtype: list
    """
    if glang != 'en':
        return chi_sent_terms(text)
    return word_tokenize(text)
[docs]
def set_rows(n=None):
    """Changes the pandas option 'display.max_rows', i.e. the maximum
    no. of rows of DataFrames to be displayed.

    :param n: The maximum no. of rows to be set, value None denotes that
        all rows are to be displayed, default to None
    :type n: int, optional
    """
    pd.set_option("display.max_rows", n)
[docs]
def reset_rows():
    """Restores the pandas option 'display.max_rows' (the maximum no. of
    rows of DataFrames to be displayed) to its default value.
    """
    option_key = "display.max_rows"
    pd.reset_option(option_key)