# ta.py
#
# Extractive text summarization of a prescribed range of Scripture
#
# Copyright (c) 2025 CWordTM Project
# Author: Johnny Cheng <drjohnnycheng@gmail.com>
#
# Updated: 5-Jun-2024 (0.6.4), 31-Oct-2024, 25-Jan-2025 (0.7.4)
#
# URL: https://github.com/drjohnnycheng/cwordtm.git
# For license information, see LICENSE.TXT
import re
import string
import numpy as np
import pandas as pd
from importlib_resources import files
from collections import OrderedDict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from . import util
[docs]
def split_chi_sentences(text):
    """Splits Chinese text into a list of unique, non-empty sentences.

    The text is cut at every sentence-ending punctuation mark (the
    ideographic full stop, and the exclamation and question marks in both
    their ASCII and full-width forms) and right after every closing
    quotation mark '』'. The sentence-ending mark itself is dropped, whereas
    '』' stays at the end of the piece that precedes the cut.

    :param text: The Chinese text to be split into sentences
    :type text: str
    :return: The sentences in order of first appearance, stripped of
        surrounding whitespace, with empty strings and duplicates removed
    :rtype: list
    """

    # Chinese text normally uses the full-width '！' and '？', so they are
    # accepted alongside the ASCII forms. The look-behind alternative is a
    # zero-width split point right after a closing quotation mark.
    sentence_endings = r'[。!?！？]|(?<=』)'

    pieces = (piece.strip() for piece in re.split(sentence_endings, text))

    # dict.fromkeys() discards repeats while keeping first-seen order
    return list(dict.fromkeys(piece for piece in pieces if piece))
[docs]
def get_sentences(docs, lang='en'):
    """Returns the list of sentences tokenized from the collection of documents.

    For Chinese, ideographic spaces are removed and the text is split by
    `split_chi_sentences`. For English, line breaks become single spaces,
    digits are removed, the text is split by NLTK's `sent_tokenize`, the
    characters '.', ':' and '•' are stripped from every sentence, and
    sentences of 10 characters or fewer are discarded.

    :param docs: The input documents storing the Scripture; a DataFrame is
        read from its `text` column, a Series, list or array is joined item
        by item, and any other value is used as the text itself
    :type docs: pandas.DataFrame or pandas.Series or numpy.ndarray or list
        or str
    :param lang: If the value is 'chi', the processed language is assumed
        to be Chinese, otherwise, it is English, default to 'en'
    :type lang: str, optional
    :return: The list of sentences tokenized from the collection of documents
    :rtype: list
    """

    # Chinese documents are concatenated directly, English ones with a space
    separator = '' if lang == 'chi' else ' '
    if isinstance(docs, pd.DataFrame):
        text = separator.join(docs.text.astype(str))
    elif isinstance(docs, pd.Series):
        text = separator.join(docs.astype(str))
    elif isinstance(docs, (list, np.ndarray)):
        text = separator.join(str(doc) for doc in docs)
    else:
        text = docs

    if lang == 'chi':
        # Remove ideographic spaces before splitting
        text = text.replace('\u3000', '')
        return split_chi_sentences(text)

    # Turn each line break (with any whitespace around it) into one space,
    # so that the words on either side of the break are not glued together
    text = re.sub(r'\s*\n\s*', ' ', text)
    text = re.sub(r'[0-9]', '', text)
    cleaned = (re.sub(r'[.:•]', '', sent) for sent in sent_tokenize(text))

    # Very short fragments are not kept as sentences
    return [sent for sent in cleaned if len(sent) > 10]
[docs]
def get_sent_scores(sentences, diction, sent_len) -> dict:
    """Returns the dictionary of a list of sentences with their scores
    computed by their words.

    The score of a sentence is the mean frequency of the dictionary words
    found in it, where a word is "found" if it is a substring of the
    lower-cased sentence. The score is stored under the key
    `sentence[:sent_len]`. A sentence containing no dictionary word gets no
    entry. Every sentence is scored on its own; if several scored sentences
    share the same key, the score of the last one is kept.

    :param sentences: The list of sentences for computing their scores
    :type sentences: list
    :param diction: The dictionary storing the collection of tokenized words
        with their frequencies
    :type diction: collections.Counter object
    :param sent_len: The number of leading characters of a sentence that
        form its key in the returned dictionary
    :type sent_len: int
    :return: The dictionary mapping each sentence key to its score
    :rtype: dict
    """

    sent_weight = {}
    for sentence in sentences:
        lowered = sentence.lower()
        matched = [diction[word] for word in diction if word in lowered]

        # Assign (rather than accumulate) so that a score already stored
        # under the same key never leaks into this sentence's average
        if matched:
            sent_weight[sentence[:sent_len]] = sum(matched) / len(matched)

    return sent_weight
[docs]
def get_summary(sentences, sent_weight, threshold, sent_len):
    """Returns the summary of the collection of sentences.

    A sentence is selected when its key `sentence[:sent_len]` is present in
    `sent_weight` and the score stored there is at least `threshold`. The
    selected sentences keep the order they have in `sentences`.

    :param sentences: The list of target sentences for summarization
    :type sentences: list
    :param sent_weight: The dictionary of sentence keys with their scores
    :type sent_weight: dict
    :param threshold: The minimum value of sentence weight for extracting
        that sentence as part of the final summary
    :type threshold: float
    :param sent_len: The length of the leading slice of a sentence that is
        used as its key in `sent_weight`
    :type sent_len: int
    :return: The list of sentences of the extractive summary
    :rtype: list
    """

    def reaches_threshold(candidate):
        key = candidate[:sent_len]
        return key in sent_weight and sent_weight[key] >= threshold

    return [candidate for candidate in sentences if reaches_threshold(candidate)]
[docs]
def summary_chi(docs, weight=1.5, sent_len=8):
    """Returns an extractive summary of a collection of Chinese sentences.

    The sentences are scored by `get_sent_scores`, and those whose score is
    at least `weight` times the mean score are returned by `get_summary`.

    :param docs: The collection of target documents for summarization
    :type docs: pandas.DataFrame or pandas.Series or numpy.ndarray or list
    :param weight: The factor to be multiplied to the threshold (the mean
        sentence score), which determines the sentences as the summary,
        default to 1.5
    :type weight: float, optional
    :param sent_len: The value passed on to `get_sent_scores` and
        `get_summary`, which use the first `sent_len` characters of a
        sentence as its key, default to 8
    :type sent_len: int, optional
    :return: The list of sentences of the extractive summary
    :rtype: list
    """

    lang = 'chi'
    util.set_lang(lang)
    diction = util.get_diction(docs)
    sentences = get_sentences(docs, lang)

    sent_scores = get_sent_scores(sentences, diction, sent_len)
    if not sent_scores:
        # No sentence was scored, so nothing can be selected; returning here
        # avoids the RuntimeWarning and the nan threshold that np.mean()
        # produces for an empty list
        return []

    threshold = np.mean(list(sent_scores.values()))
    return get_summary(sentences, sent_scores, weight * threshold, sent_len)
[docs]
def preprocess_sent(text):
    """Preprocesses English text: splits it into sentences, lower-cases and
    tokenizes every sentence into words, removes English stopwords,
    lemmatizes the remaining words, and tags them with Part-of-Speech (POS).

    :param text: The text to be preprocessed; a list or array of strings is
        first joined with single spaces
    :type text: str or list or numpy.ndarray
    :return: The list of preprocessed sentences, each being a list of
        tagged words (word, pos)
    :rtype: list of lists of tuples (str, str)
    """

    if isinstance(text, (list, np.ndarray)):
        text = ' '.join(text)

    # Sentence split, then lower-cased word tokens per sentence
    tokenized = [word_tokenize(raw.lower()) for raw in sent_tokenize(text)]

    # Drop stopwords
    stop_words = set(stopwords.words('english'))
    content_words = [[token for token in tokens if token not in stop_words]
                     for tokens in tokenized]

    # Reduce every remaining word to its lemma
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = [[lemmatize(token) for token in tokens]
              for tokens in content_words]

    # Attach a POS tag to every word
    return [pos_tag(tokens) for tokens in lemmas]
[docs]
def summary_en(docs, sent_len=8):
    """Returns an extractive summary of a collection of English sentences.

    Every sentence is scored by the number of nouns and verbs among the
    first `sent_len` words left after preprocessing (see `preprocess_sent`).
    The top-scoring third of the sentences is returned, highest score
    first, and at least one sentence is returned whenever the text contains
    any sentence.

    :param docs: The collection of target documents for summarization; a
        DataFrame is read from its `text` column, a Series, list or array
        is joined item by item, and any other value is used as the text
        itself
    :type docs: pandas.DataFrame or pandas.Series or numpy.ndarray or list
        or text
    :param sent_len: The maximum number of preprocessed words in a sentence
        to be considered for its score, default to 8
    :type sent_len: int, optional
    :return: The list of sentences of the extractive summary
    :rtype: list
    """

    join_str = ' '
    if isinstance(docs, pd.DataFrame):
        text = join_str.join(docs.text.astype(str))
    elif isinstance(docs, pd.Series):
        text = join_str.join(docs.astype(str))
    elif isinstance(docs, (list, np.ndarray)):
        text = join_str.join(str(doc) for doc in docs)
    else:
        text = docs

    tagged_sentences = preprocess_sent(text)

    # Score = count of noun ('NN*') and verb ('VB*') tags in the leading words
    sentence_scores = [
        sum(1 for _word, pos in sentence[:sent_len]
            if pos.startswith(('NN', 'VB')))
        for sentence in tagged_sentences
    ]

    # Sentence indices ordered by descending score; ties keep text order
    top_sentences = sorted(range(len(sentence_scores)),
                           key=lambda i: sentence_scores[i],
                           reverse=True)

    # Same tokenization as in preprocess_sent, so the indices line up
    sentences = sent_tokenize(text)

    # Keep the best third, but never drop to an empty summary when there is
    # at least one sentence (1 // 3 and 2 // 3 are both 0)
    sm_max = len(top_sentences) // 3
    if top_sentences:
        sm_max = max(1, sm_max)

    return [sentences[i] for i in top_sentences[:sm_max]]