From c138cda73512bf0511339a720d1003d8e68022b8 Mon Sep 17 00:00:00 2001
From: jxxghp
Date: Tue, 29 Aug 2023 12:22:14 +0800
Subject: [PATCH] fix #300

---
Reviewer notes (free text after the three-dash line; not part of the commit):

* message.py: the word-count threshold in the chat-routing condition is
  raised from 10 to 15.
* string.py: count_words() used to return len(s.split()) for pure
  ASCII-alphanumeric input and len(s) otherwise.  It now returns the
  number of CJK ideographs (U+4E00..U+9FA5) plus the number of
  ASCII-letter runs; digits, whitespace and punctuation are not counted.
* This revision of the string.py hunk folds the two findall() passes and
  the no-op isalpha() filters into one re.findall() over an alternation.
  Results are identical.  The post-image blob id on that hunk's "index"
  line was not recomputed.
* The copy under review had every whitespace run collapsed, so line
  breaks and indentation below are reconstructed (the message.py depth
  in particular is a best guess).  If context fails to match, try
  `git apply --ignore-whitespace`.

 app/chain/message.py |  2 +-
 app/utils/string.py  | 24 ++++++++++++------------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/app/chain/message.py b/app/chain/message.py
index 212ddbda..fdc0e3ec 100644
--- a/app/chain/message.py
+++ b/app/chain/message.py
@@ -270,7 +270,7 @@ class MessageChain(ChainBase):
             elif text.startswith("#") \
                     or re.search(r"^请[问帮你]", text) \
                     or re.search(r"[??]$", text) \
-                    or StringUtils.count_words(text) > 10 \
+                    or StringUtils.count_words(text) > 15 \
                     or text.find("继续") != -1:
                 # 聊天
                 content = text
diff --git a/app/utils/string.py b/app/utils/string.py
index a6d31dde..4bcd683b 100644
--- a/app/utils/string.py
+++ b/app/utils/string.py
@@ -418,21 +418,21 @@ class StringUtils:
         return curr + format(amount, ",")
 
     @staticmethod
-    def count_words(s: str) -> int:
+    def count_words(text: str) -> int:
         """
-        计算字符串中包含的单词数量,只适用于简单的单行文本
-        :param s: 要计算的字符串
-        :return: 字符串中包含的单词数量
+        Count the words in a string that may mix Chinese and English.
+        Every CJK ideograph in U+4E00..U+9FA5 counts as one word and every
+        run of ASCII letters counts as one word; digits, whitespace and
+        punctuation are never counted.
+        :param text: the string to measure
+        :return: number of ideographs plus number of English words
         """
-        # 匹配英文单词
-        if re.match(r'^[A-Za-z0-9\s]+$', s):
-            # 如果是英文字符串,则按空格分隔单词,并计算单词数量
-            num_words = len(s.split())
-        else:
-            # 如果不是英文字符串,则计算字符数量
-            num_words = len(s)
+        if not text:
+            return 0
 
-        return num_words
+        # The two character classes are disjoint, so a single alternation
+        # finds exactly the ideographs plus the letter runs in one pass.
+        return len(re.findall(r'[\u4e00-\u9fa5]|[a-zA-Z]+', text))
 
     @staticmethod
     def split_text(text: str, max_length: int) -> Generator: