# quot.py
#
# Locate the Scripture in OT quoted from an NT verse
#
# Copyright (c) 2025 CWordTM Project
# Author: Johnny Cheng <drjohnnycheng@gmail.com>
#
# Updated: 16-May-2024 (0.6.4), 27-Oct-2024, 1-Feb-2025 (0.7.4)
#
# URL: https://github.com/drjohnnycheng/cwordtm.git
# For license information, see LICENSE.TXT
import warnings
warnings.filterwarnings("ignore")
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from . import util
# Chinese stopwords (plus punctuation marks and digits) handed to TfidfVectorizer when lang == 'chi'
chinese_stopwords = ["、","。","〈","〉","《","》","一","一些","一何","一切","一則","一方面","一旦","一來","一樣","一般","一轉眼","七","萬一","三","上","上下","下","不","不僅","不但","不光","不單","不只","不外乎","不如","不妨","不盡","不儘然","不得","不怕","不惟","不成","不拘","不料","不是","不比","不然","不特","不獨","不管","不至於","不若","不論","不過","不問","與","與其","與其說","與否","與此同時","且","且不說","且說","兩者","個","個別","中","臨","為","為了","為什麼","為何","為止","為此","為著","乃","乃至","乃至於","麼","之","之一","之所以","之類","烏乎","乎","乘","九","也","也好","也罷","了","二","二來","於","於是","於是乎","云云","雲爾","五","些","亦","人","人們","人家","什","什麼","什麼樣","今","介於","仍","仍舊","從","從此","從而","他","他人","他們","他們們","以","以上","以為","以便","以免","以及","以故","以期","以來","以至","以至於","以致","們","任","任何","任憑","會","似的","但","但凡","但是","何","何以","何況","何處","何時","餘外","作為","你","你們","使","使得","例如","依","依據","依照","便於","俺","俺們","倘","倘使","倘或","倘然","倘若","借","借儻然","假使","假如","假若","做","像","兒","先不先","光是","全體","全部","八","六","兮","共","關於","關於具體地說","其","其一","其中","其二","其他","其餘","其它","其次","具體地說","具體說來","兼之","內","再","再其次","再則","再有","再者","再者說","再說","冒","沖","況且","幾","幾時","凡","凡是","憑","憑藉","出於","出來","分","分別","則","則甚","別","別人","別處","別是","別的","別管","別說","到","前後","前此","前者","加之","加以","即","即令","即使","即便","即如","即或","即若","卻","去","又","又及","及","及其","及至","反之","反而","反過來","反過來說","受到","另","另一方面","另外","另悉","只","只當","只怕","只是","只有","只消","只要","只限","叫","叮咚","可","可以","可是","可見","各","各個","各位","各種","各自","同","同時","後","後者","向","向使","向著","嚇","嗎","否則","吧","吧噠","含","吱","呀","呃","嘔","唄","嗚","嗚呼","呢","呵","呵呵","呸","呼哧","咋","和","咚","咦","咧","咱","咱們","咳","哇","哈","哈哈","哉","哎","哎呀","哎喲","嘩","喲","哦","哩","哪","哪個","哪些","哪兒","哪天","哪年","哪怕","哪樣","哪邊","哪裡","哼","哼唷","唉","唯有","啊","啐","啥","啦","啪達","啷當","喂","喏","喔唷","嘍","嗡","嗡嗡","呵","嗯","噯","嘎","嘎登","噓","嘛","嘻","嘿","嘿嘿","四","因","因為","因了","因此","因著","因而","固然","在","在下","在於","地","基於","處在","多","多麼","多少","大","大家","她","她們","好","如","如上","如上所述","如下","如何","如其","如同","如是","如果","如此","如若","始而","孰料","孰知","甯","寧可","寧願","寧肯","它","它們","對","對於","對待","對方","對比","將","小","爾","爾後","爾爾","尚且","就","就是","就是了","就是說","就算","就要","盡","儘管","儘管如此","豈但","己","已","已矣","巴","巴巴","年","並","並且","庶乎","庶幾",
"開外","開始","歸","歸齊","當","當地","當然","當著","彼","彼時","彼此","往","待","很","得","得了","怎","怎麼","怎麼辦","怎麼樣","怎奈","怎樣","總之","總的來看","總的來說","總的說來","總而言之","恰恰相反","您","惟其","慢說","我","我們","或","或則","或是","或曰","或者","截至","所","所以","所在","所幸","所有","才","才能","打","打從","把","抑或","拿","按","按照","換句話說","換言之","據","據此","接著","故","故此","故而","旁人","無","無寧","無論","既","既往","既是","既然","日","時","時候","是","是以","是的","更","曾","替","替代","最","月","有","有些","有關","有及","有時","有的","望","朝","朝著","本","本人","本地","本著","本身","來","來著","來自","來說","極了","果然","果真","某","某個","某些","某某","根據","歟","正值","正如","正巧","正是","此","此地","此處","此外","此時","此次","此間","毋寧","每","每當","比","比及","比如","比方","沒奈何","沿","沿著","漫說","焉","然則","然後","然而","照","照著","猶且","猶自","甚且","甚麼","甚或","甚而","甚至","甚至於","用","用來","由","由於","由是","由此","由此可見","的","的確","的話","直到","相對而言","省得","看","眨眼","著","著呢","矣","矣乎","矣哉","離","秒","竟而","第","等","等到","等等","簡言之","管","類如","緊接著","縱","縱令","縱使","縱然","經","經過","結果","給","繼之","繼後","繼而","綜上所述","罷了","者","而","而且","而況","而後","而外","而已","而是","而言","能","能否","騰","自","自個兒","自從","自各兒","自後","自家","自己","自打","自身","至","至於","至今","至若","致","般的","若","若夫","若是","若果","若非","莫不然","莫如","莫若","雖","雖則","雖然","雖說","被","要","要不","要不是","要不然","要麼","要是","譬喻","譬如","讓","許多","論","設使","設或","設若","誠如","誠然","該","說","說來","請","諸","諸位","諸如","誰","誰人","誰料","誰知","賊死","賴以","趕","起","起見","趁","趁著","越是","距","跟","較","較之","邊","過","還","還是","還有","還要","這","這一來","這個","這麼","這麼些","這麼樣","這麼點兒","這些","這會兒","這兒","這就是說","這時","這樣","這次","這般","這邊","這裡","進而","連","連同","逐步","通過","遵循","遵照","那","那個","那麼","那麼些","那麼樣","那些","那會兒","那兒","那時","那樣","那般","那邊","那裡","都","鄙人","鑒於","針對","阿","除","除了","除外","除開","除此之外","除非","隨","隨後","隨時","隨著","難道說","零","非","非但","非徒","非特","非獨","靠","順","順著","首先","︿","!","#","$","%","&","(",")","*","+",",","0","1","2","3","4","5","6","7","8","9",":",";","<",">","?","@","[","]","{","|","}","~","¥"]
# Bible book categories table; starts as None and is loaded from
# 'book_categories.csv' by match_verse() the first time it is called
book_cat = None
[docs]
def tokenize(sentence):
    """Returns the tokens of a Chinese sentence, with punctuation removed.

    Any run of a repeated character is first shortened to two occurrences.
    The text is then split at every run of non-word characters and each
    remaining piece is segmented with ``jieba``, so punctuation and
    whitespace never show up as tokens and no word is formed across a
    punctuation boundary.

    :param sentence: The target text to be tokenized
    :type sentence: str
    :return: The generator object yielding the tokens extracted
        from the sentence
    :rtype: generator
    """

    without_duplicates = re.sub(r'(.)\1+', r'\1\1', sentence)

    # Previously the punctuation-free text was computed but never used, so
    # punctuation marks were tokenized too.  Splitting (instead of deleting)
    # keeps the clauses apart while dropping the punctuation itself.
    for piece in re.split(r'[^\w]+', without_duplicates):
        if piece:
            yield from jieba.cut(piece)
[docs]
def match_text(target, sent_tokens, lang, threshold, n=5):
    """Returns a list of tuples of the cosine similarity measure of an OT verse
    with the target verse and the index of that OT verse in the list of
    OT verses (``sent_tokens``).

    Side effect: ``target`` is appended to ``sent_tokens`` in place so that
    it is vectorized together with the OT verses.  The caller is responsible
    for removing it again (``match_verse`` does so right after the call).

    :param target: The target verse to be matched, default to None
    :type target: str
    :param sent_tokens: The list of OT verse texts to be compared with
        the target verse, default to None
    :type sent_tokens: list
    :param lang: If the value is 'chi' , the processed language is assumed to be
        Chinese, otherwise, it is English, default to None
    :type lang: str
    :param threshold: The threshold value of the cosine similarity measure
        between the target verse and an OT verse, where the cosine similarity measure
        of a matched OT verse and the target verse should be greater this value,
        default to None
    :type threshold: float
    :param n: The upper bound of the number of matched verses, default to 5
    :type n: int, optional
    :return: The list of tuples of the cosine smilarity measure and the index
        of the OT verse, ordered from the most to the least similar
    :rtype: list
    """

    # Kept for backward compatibility: callers expect the target to be
    # the last element of the list after this call.
    sent_tokens.append(target)

    if lang == 'chi':
        TfidfVec = TfidfVectorizer(tokenizer=tokenize, stop_words=chinese_stopwords)
    else:
        TfidfVec = TfidfVectorizer(analyzer='word', stop_words='english')

    tfidf = TfidfVec.fit_transform(sent_tokens)
    if tfidf.shape[0] < 2:
        # Only the target itself was supplied: nothing to compare against
        return []

    # Compare the target (last row) with the OT verses only.  Leaving the
    # target's own row out means its self-similarity can never be returned,
    # even when an OT verse is identical to the target and ties with it.
    sims = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()

    # Sort once (instead of once per candidate); slicing copes with fewer
    # than n candidates.
    best = sims.argsort()[::-1][:max(n, 0)]
    return [(sims[idx], idx) for idx in best if sims[idx] > threshold]
[docs]
def match_verse(i, ot_list, otdf, df, book, chap, verse, lang, threshold):
    """Looks for a quotation in the target NT verse (book, chap, verse) and,
    if there is one, prints the NT verse followed by the OT verse(s) in
    the list of OT verses (ot_list) that match the quotation.

    :param i: The number of matched verses so far, default to None
    :type i: int
    :param ot_list: The list of OT verses (str) to be matched, default to None
    :type ot_list: list
    :param otdf: The DataFrame storing the prescribed OT verses to be matched,
        default to None
    :type otdf: pandas.DataFrame
    :param df: The DataFrame storing the collection of the target NT verses
        to be matched, default to None
    :type df: pandas.DataFrame
    :param book: The Bible book short name (3 characters) of the target NT verse
        to be matched, default to None
    :type book: str
    :param chap: The chapter number of the target NT verse to be matched,
        default to None
    :type chap: int
    :param verse: The verse number of the target NT verse to be matched,
        default to None
    :type verse: int
    :param lang: If the value is 'en', the processed language is assumed to be English
        otherwise, it is Chinese, default to None
    :type lang: str
    :param threshold: The threshold value of the cosine similarity measure
        between the target verse and an OT verse, where that measure for successful match
        should be greater this value, default to None
    :type threshold: float
    :return: True if the target verse contains a quotation (the verse is then
        printed, together with any OT verse whose similarity exceeds the
        threshold), False if no quotation was found in it
    :rtype: bool
    """

    global book_cat
    if book_cat is None:
        # Load the book categories table once and reuse it for later calls
        book_cat = util.load_word('book_categories.csv')

    if lang == 'en':
        book_sname = book
        quot_marks = ['"', '"']
    else:
        book_sname = book_cat[book_cat.book_s == book].book_chi.iloc[0]
        quot_marks=['『', '』']

    averse = util.extract(df, book=book, chapter=chap, verse=verse)
    vtext = util.get_text(averse)
    quot = extract_quotation(vtext, quot_marks)
    if quot == "":
        return False

    nt_str = book_sname + ' ' + str(chap) + ':' + str(verse)
    print("(%2d) %-6s %s" %(i+1, nt_str, vtext))   # NT Verse

    # match_text() treats only 'chi' as Chinese, whereas this function treats
    # everything but 'en' as Chinese; normalize so both agree.
    text_lang = 'en' if lang == 'en' else 'chi'

    # match_text() appends the quotation to ot_list.  Truncate the list back
    # to its original length afterwards (even on error) rather than removing
    # by value, which could delete a genuine OT verse whose text equals the
    # quotation and shift every index relative to otdf.
    ot_count = len(ot_list)
    try:
        results = match_text(quot, ot_list, text_lang, threshold)
    finally:
        del ot_list[ot_count:]

    for cos_sim, idx in results:
        sv = otdf.iloc[idx]
        if lang == 'en':
            book_sname = sv.book
        else:
            book_sname = book_cat[book_cat.book_s == sv.book].book_chi.iloc[0]

        ot_str = book_sname + ' ' + str(sv.chapter) + ':' + str(sv.verse)
        print(" -> %.4f %-9s %s" %(cos_sim, ot_str, sv.text))   # OT Verse

    return True
[docs]
def show_quot(target, source='ot', lang='en', threshold=0.5):
    """Prints, for every verse in the target collection that contains a
    quotation, the verse itself and the OT verses found to match it.

    :param target: The collection of target NT verses to be matched, default to None
    :type target: pandas.DataFrame
    :param source: The string representing the collection of all or subset of OT verses
        to be matched: one of the OT categories 'tor', 'oth', 'ket', 'map',
        'mip', or any other value for the whole OT, default to 'ot'
    :type source: str, optional
    :param lang: If the value is 'en', the processed language is assumed to be English
        otherwise, it is Chinese, default to 'en'
    :type lang: str, optional
    :param threshold: The threshold value of the cosine similarity measure
        between the target verse and an OT verse, where that measure for successful match
        should be greater this value, default to 0.5
    :type threshold: float, optional
    :return: Nothing; the matches are written to standard output
    :rtype: None
    """

    util.set_lang(lang)

    # English uses the default Scripture of util.load_word(); any other
    # language value loads 'cuv.csv'
    df = util.load_word() if lang == 'en' else util.load_word('cuv.csv')

    # A recognized category narrows the OT verses; anything else selects
    # the verses with testament=0
    ot_categories = ('tor', 'oth', 'ket', 'map', 'mip')
    if source not in ot_categories:
        otdf = util.extract(df, testament=0)
    else:
        otdf = util.extract(df, category=source)
    ot_list = util.get_text_list(otdf)

    print("\n*** Quote(s) from OT")

    shown = 0   # number of target verses reported so far
    for _, row in target.iterrows():
        reported = match_verse(shown, ot_list, otdf, target,
                               row.book, row.chapter, row.verse,
                               lang, threshold)
        if reported:
            shown += 1

    print("** End of Quotes **")