Source code for cwordtm.ta

# ta.py
#    
# Extractive text summarization of a prescribed range of Scripture
#
# Copyright (c) 2025 CWordTM Project 
# Author: Johnny Cheng <drjohnnycheng@gmail.com>
#
# Updated: 5-Jun-2024 (0.6.4), 31-Oct-2024, 25-Jan-2025 (0.7.4)
#
# URL: https://github.com/drjohnnycheng/cwordtm.git
# For license information, see LICENSE.TXT

import re
import string
import numpy as np
import pandas as pd
from importlib_resources import files
from collections import OrderedDict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

from . import util



[docs]
def split_chi_sentences(text):
    """Splits Chinese text into a list of distinct, non-empty sentences.

    The text is cut at every full-width sentence-ending mark ('。', '！',
    '？'), which is dropped from the output, and at the position right after
    every closing quotation mark '』', which is kept. Each piece is stripped
    of surrounding whitespace, empty pieces are discarded, and a sentence
    that occurs more than once is reported only at its first occurrence.

    :param text: The Chinese text to be split
    :type text: str
    :return: The list of distinct sentences in order of first appearance
    :rtype: list
    """

    # Either consume a sentence terminator, or split at the zero-width
    # position that immediately follows a closing quotation mark
    boundary = r'[。！？]|(?<=』)'

    # An ordered mapping doubles as an order-preserving set of sentences
    unique = OrderedDict()
    for piece in re.split(boundary, text):
        piece = piece.strip()
        if piece:
            unique.setdefault(piece, None)

    return list(unique)




[docs]
def get_sentences(docs, lang='en'):
    """Returns the list of sentences tokenized from the collection of documents.

    :param docs: The input documents storing the Scripture. A DataFrame is
        read from its ``text`` column; a Series, list or array is joined
        element by element; any other value is used as the text itself
    :type docs: pandas.DataFrame or pandas.Series or numpy.ndarray or list or str
    :param lang: If the value is 'chi', the processed language is assumed
        to be Chinese, otherwise it is English, default to 'en'
    :type lang: str, optional
    :return: The list of sentences tokenized from the collection of documents
    :rtype: list
    """

    # Chinese text has no word separators, English documents are joined
    # with a space so that adjacent documents do not fuse together
    join_str = '' if lang == 'chi' else ' '
    if isinstance(docs, pd.DataFrame):
        text = join_str.join(docs.text.astype(str))
    elif isinstance(docs, pd.Series):
        text = join_str.join(docs.astype(str))
    elif isinstance(docs, (list, np.ndarray)):
        text = join_str.join(str(doc) for doc in docs)
    else:
        text = docs

    if lang == 'chi':
        # Drop ideographic (full-width) spaces before splitting
        return split_chi_sentences(text.replace('\u3000', ''))

    # Turn line breaks into spaces: deleting them outright would glue the
    # last word of a line to the first word of the next one and hide the
    # sentence boundary between them from the tokenizer
    text = text.replace('\n', ' ')
    # Remove all ASCII digits from the text
    text = re.sub(r'[0-9]', '', text)

    # Strip periods, colons and bullets from every tokenized sentence
    sentences = (re.sub(r'[.:•]', '', sent) for sent in sent_tokenize(text))

    # Keep only sentences longer than 10 characters
    return [sent for sent in sentences if len(sent) > 10]




[docs]
def get_sent_scores(sentences, diction, sent_len) -> dict:
    """Returns the dictionary of sentence keys with their scores
    computed by their words.

    The score of a sentence is the mean frequency of the dictionary words
    that occur (as substrings) in the lower-cased sentence. A sentence is
    keyed by its first ``sent_len`` characters, and a sentence containing no
    dictionary word gets no entry. Every sentence is scored independently;
    when several sentences share the same key, the score of the last one
    that contains a dictionary word is retained.

    :param sentences: The list of sentences for computing their scores
    :type sentences: list
    :param diction: The dictionary storing the collection of tokenized words
        with their frequencies
    :type diction: collections.Counter object
    :param sent_len: The number of leading characters of a sentence used
        as its key in the returned dictionary
    :type sent_len: int
    :return: The dictionary mapping each sentence key to its score
    :rtype: dict
    """

    sent_weight = {}

    for sentence in sentences:
        # Lower-case once per sentence instead of once per dictionary word
        lowered = sentence.lower()
        matched = [diction[word] for word in diction if word in lowered]

        # Accumulate locally so that a previous sentence with the same key
        # cannot leak its score into this one
        if matched:
            sent_weight[sentence[:sent_len]] = sum(matched) / len(matched)

    return sent_weight




[docs]
def get_summary(sentences, sent_weight, threshold, sent_len):
    """Returns the summary of the collection of sentences.

    A sentence is selected when its key, i.e. its first ``sent_len``
    characters, is present in ``sent_weight`` with a score of at least
    ``threshold``. Selected sentences keep their original order.

    :param sentences: The list of target sentences for summarization
    :type sentences: list
    :param sent_weight: The dictionary of sentence keys with their scores
    :type sent_weight: dict
    :param threshold: The minimum value of sentence weight for extracting
        that sentence as part of the final summary
    :type threshold: float
    :param sent_len: The number of leading characters of a sentence used
        as its key in ``sent_weight``
    :type sent_len: int
    :return: The list of sentences of the extractive summary
    :rtype: list
    """

    def is_selected(candidate):
        # Unscored sentences are never selected
        key = candidate[:sent_len]
        return key in sent_weight and sent_weight[key] >= threshold

    return [candidate for candidate in sentences if is_selected(candidate)]




[docs]
def summary_chi(docs, weight=1.5, sent_len=8):
    """Returns an extractive summary of a collection of Chinese sentences.

    The sentences of the documents are scored with ``get_sent_scores`` and
    those whose score reaches ``weight`` times the mean score are returned.

    :param docs: The collection of target documents for summarization
    :type docs: pandas.DataFrame or pandas.Series or numpy.ndarray or list
    :param weight: The factor to be multiplied to the threshold (the mean
        sentence score), which determines the sentences as the summary,
        default to 1.5
    :type weight: float, optional
    :param sent_len: The length passed to ``get_sent_scores`` and
        ``get_summary`` to derive the key of each sentence, default to 8
    :type sent_len: int, optional
    :return: The list of sentences of the extractive summary
    :rtype: list
    """

    lang = 'chi'
    util.set_lang(lang)
    # Presumably the word-frequency dictionary of the documents -- the
    # helper lives in the util module, confirm there
    diction = util.get_diction(docs)
    sentences = get_sentences(docs, lang)

    sent_scores = get_sent_scores(sentences, diction, sent_len)
    if not sent_scores:
        # Nothing was scored, so nothing can be selected; returning here
        # avoids the NaN and RuntimeWarning of averaging an empty list
        return []

    threshold = np.mean(list(sent_scores.values()))
    return get_summary(sentences, sent_scores, weight * threshold, sent_len)




[docs]
def preprocess_sent(text):
    """Preprocesses English text by tokenizing text into sentences of words,
    converting text to lower case, removing stopwords, lemmatizing text, and
    tagging text with Part-of-Speech (POS).

    :param text: The text to be preprocessed; a list or array of strings
        is first joined with single spaces
    :type text: str or list or numpy.ndarray
    :return: The list of preprocessed sentences, each being a list of
        tagged tokens (word, pos)
    :rtype: list of lists of tuples (str, str)
    """

    if isinstance(text, (list, np.ndarray)):
        text = ' '.join(text)

    raw_sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tagged_sentences = []
    for raw in raw_sentences:
        # Lower-case the sentence before splitting it into word tokens
        tokens = word_tokenize(raw.lower())
        # Drop stopwords, then lemmatize what remains
        lemmas = [lemmatizer.lemmatize(token)
                  for token in tokens if token not in stop_words]
        tagged_sentences.append(pos_tag(lemmas))

    return tagged_sentences




[docs]
def summary_en(docs, sent_len=8):
    """Returns an extractive summary of a collection of English sentences.

    Each sentence is scored by the number of nouns and verbs among its first
    ``sent_len`` preprocessed tokens, and the top-scoring third of the
    sentences (rounded down) is returned in descending score order. Input
    with fewer than three sentences therefore yields an empty summary.

    :param docs: The collection of target documents for summarization
    :type docs: pandas.DataFrame or pandas.Series or numpy.ndarray or list or text
    :param sent_len: The maximum number of preprocessed words in a sentence
        to be processed, default to 8
    :type sent_len: int, optional
    :return: The list of sentences of the extractive summary
    :rtype: list
    """

    separator = ' '
    if isinstance(docs, pd.DataFrame):
        text = separator.join(docs.text.astype(str).tolist())
    elif isinstance(docs, pd.Series):
        text = separator.join(docs.astype(str).tolist())
    elif isinstance(docs, (list, np.ndarray)):
        text = separator.join(map(str, docs))
    else:
        text = docs

    # One score per sentence: count of noun (NN*) and verb (VB*) tags
    # within the leading sent_len tokens
    scores = [
        sum(1 for _, pos in tagged[:sent_len] if pos.startswith(('NN', 'VB')))
        for tagged in preprocess_sent(text)
    ]

    # Sentence indices ordered from the highest to the lowest score;
    # the sort is stable, so ties keep their document order
    ranking = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # Re-tokenize the same text to recover the original sentences that the
    # indices refer to
    originals = sent_tokenize(text)
    quota = len(ranking) // 3

    return [originals[idx] for idx in ranking[:quota]]