feat 支持自定义词表

2023-07-17 13:05:06 +08:00
parent 5f613874db
commit 2b9c4b081e
9 changed files with 163 additions and 57 deletions
--- a/app/chain/message.py
+++ b/app/chain/message.py
@@ -151,6 +151,7 @@ class MessageChain(ChainBase):
                              f"{self._current_meta.sea} 媒体库中已存在",
                        userid=userid))
                    return
+                # 添加订阅，状态为N
                self.subscribechain.add(title=mediainfo.title,
                                        year=mediainfo.year,
                                        mtype=mediainfo.type,
@@ -182,7 +183,7 @@ class MessageChain(ChainBase):
                    else:
                        # 未完成下载
                        logger.info(f'{self._current_media.title_year} 未下载未完整，添加订阅 ...')
-                        # 添加订阅
+                        # 添加订阅，状态为R
                        self.subscribechain.add(title=self._current_media.title,
                                                year=self._current_media.year,
                                                mtype=self._current_media.type,
@@ -190,7 +191,8 @@ class MessageChain(ChainBase):
                                                season=self._current_meta.begin_season,
                                                channel=channel,
                                                userid=userid,
-                                                username=username)
+                                                username=username,
+                                                state="R")
                else:
                    # 下载种子
                    context: Context = cache_list[int(text) - 1]
@@ -203,13 +205,13 @@ class MessageChain(ChainBase):
            if not cache_data:
                # 没有缓存
                self.post_message(Notification(
-                        channel=channel, title="输入有误！", userid=userid))
+                    channel=channel, title="输入有误！", userid=userid))
                return

            if self._current_page == 0:
                # 第一页
                self.post_message(Notification(
-                        channel=channel, title="已经是第一页了！", userid=userid))
+                    channel=channel, title="已经是第一页了！", userid=userid))
                return
            cache_type: str = cache_data.get('type')
            cache_list: list = cache_data.get('items')
@@ -242,7 +244,7 @@ class MessageChain(ChainBase):
            if not cache_data:
                # 没有缓存
                self.post_message(Notification(
-                        channel=channel, title="输入有误！", userid=userid))
+                    channel=channel, title="输入有误！", userid=userid))
                return
            cache_type: str = cache_data.get('type')
            cache_list: list = cache_data.get('items')
@@ -253,7 +255,7 @@ class MessageChain(ChainBase):
            if not cache_list:
                # 没有数据
                self.post_message(Notification(
-                        channel=channel, title="已经是最后一页了！", userid=userid))
+                    channel=channel, title="已经是最后一页了！", userid=userid))
                return
            else:
                if cache_type == "Torrent":
@@ -282,12 +284,12 @@ class MessageChain(ChainBase):
            # 识别
            if not meta.name:
                self.post_message(Notification(
-                        channel=channel, title="无法识别输入内容！", userid=userid))
+                    channel=channel, title="无法识别输入内容！", userid=userid))
                return
            # 开始搜索
            if not medias:
                self.post_message(Notification(
-                        channel=channel, title=f"{meta.name} 没有找到对应的媒体信息！", userid=userid))
+                    channel=channel, title=f"{meta.name} 没有找到对应的媒体信息！", userid=userid))
                return
            logger.info(f"搜索到 {len(medias)} 条相关媒体信息")
            # 记录当前状态
--- a/app/core/meta/metaanime.py
+++ b/app/core/meta/metaanime.py
@@ -2,7 +2,7 @@ import re
 import zhconv
 import anitopy
 from app.core.meta.metabase import MetaBase
-from app.core.meta.release_groups import ReleaseGroupsMatcher
+from app.core.meta.releasegroup import ReleaseGroupsMatcher
 from app.utils.string import StringUtils
 from app.schemas.types import MediaType

--- a/app/core/meta/metabase.py
+++ b/app/core/meta/metabase.py
@@ -15,7 +15,9 @@ class MetaBase(object):
    """
    # 是否处理的文件
    isfile: bool = False
-    # 原字符串
+    # 原标题字符串
+    title: str = ""
+    # 识别用字符串
    org_string: Optional[str] = None
    # 副标题
    subtitle: Optional[str] = None
@@ -53,6 +55,8 @@ class MetaBase(object):
    video_encode: Optional[str] = None
    # 音频编码
    audio_encode: Optional[str] = None
+    # 应用的识别词信息
+    apply_words: Optional[List[str]] = None

    # 副标题解析
    _subtitle_flag = False
--- a/app/core/meta/metavideo.py
+++ b/app/core/meta/metavideo.py
@@ -3,7 +3,7 @@ from pathlib import Path

 from app.core.config import settings
 from app.core.meta.metabase import MetaBase
-from app.core.meta.release_groups import ReleaseGroupsMatcher
+from app.core.meta.releasegroup import ReleaseGroupsMatcher
 from app.utils.string import StringUtils
 from app.utils.tokens import Tokens
 from app.schemas.types import MediaType
--- a/app/core/meta/release_groups.py
+++ b/app/core/meta/release_groups.py
@@ -1,5 +1,7 @@
 import regex as re

+from app.db.systemconfig_oper import SystemConfigOper
+from app.schemas.types import SystemConfigKey
 from app.utils.singleton import Singleton


@@ -8,8 +10,7 @@ class ReleaseGroupsMatcher(metaclass=Singleton):
    识别制作组、字幕组
    """
    __release_groups: str = None
-    custom_release_groups: str = None
-    custom_separator: str = None
+    # 内置组
    RELEASE_GROUPS: dict = {
        "0ff": ['FF(?:(?:A|WE)B|CD|E(?:DU|B)|TV)'],
        "1pt": [],
@@ -74,6 +75,7 @@ class ReleaseGroupsMatcher(metaclass=Singleton):
    }

    def __init__(self):
+        self.systemconfig = SystemConfigOper()
        release_groups = []
        for site_groups in self.RELEASE_GROUPS.values():
            for release_group in site_groups:
@@ -89,8 +91,10 @@ class ReleaseGroupsMatcher(metaclass=Singleton):
        if not title:
            return ""
        if not groups:
-            if self.custom_release_groups:
-                groups = f"{self.__release_groups}|{self.custom_release_groups}"
+            # 自定义组
+            custom_release_groups = self.systemconfig.get(SystemConfigKey.CustomReleaseGroups)
+            if custom_release_groups:
+                groups = f"{self.__release_groups}|{custom_release_groups}"
            else:
                groups = self.__release_groups
        title = f"{title} "
@@ -100,12 +104,4 @@ class ReleaseGroupsMatcher(metaclass=Singleton):
        for item in re.findall(groups_re, title):
            if item not in unique_groups:
                unique_groups.append(item)
-        separator = self.custom_separator or "@"
-        return separator.join(unique_groups)
-
-    def update_custom(self, release_groups: str = None, separator: str = None):
-        """
-        更新自定义制作组/字幕组，自定义分隔符
-        """
-        self.custom_release_groups = release_groups
-        self.custom_separator = separator
+        return "@".join(unique_groups)
--- a/app/core/meta/words.py
+++ b/app/core/meta/words.py
@@ -0,0 +1,118 @@
+from typing import List, Tuple
+
+import cn2an
+import regex as re
+
+from app.db.systemconfig_oper import SystemConfigOper
+from app.log import logger
+from app.schemas.types import SystemConfigKey
+from app.utils.singleton import Singleton
+
+
+class WordsMatcher(metaclass=Singleton):
+    """
+    Pre-processes titles with the custom identifier words stored in the system
+    configuration (SystemConfigKey.CustomIdentifiers).
+    """
+
+    def __init__(self):
+        self.systemconfig = SystemConfigOper()
+
+    def prepare(self, title: str) -> Tuple[str, List[str]]:
+        """
+        Apply every configured identifier word to the title; three formats are supported
+        1: blocked word (removed from the title)
+        2: replaced word => replacement
+        3: front anchor <> back anchor >> offset expression (EP is the episode number)
+        :param title: title to pre-process
+        :return: the processed title and the list of words that were applied to it
+        """
+        applied_words = []
+        # Read the custom identifier words
+        words: List[str] = self.systemconfig.get(SystemConfigKey.CustomIdentifiers) or []
+        for word in words:
+            if not word:
+                continue
+            try:
+                if " => " in word:
+                    # Replacement word; only the first two " => " separated parts are used
+                    strings = word.split(" => ")
+                    title, message, state = self.__replace_regex(title, strings[0], strings[1])
+                elif " >> " in word and " <> " in word:
+                    # Episode offset
+                    strings = word.split(" <> ")
+                    offsets = strings[1].split(" >> ")
+                    # The back anchor is only the part before " >> ", not the whole right side
+                    title, message, state = self.__episode_offset(title, strings[0], offsets[0],
+                                                                  offsets[1])
+                else:
+                    # Blocked word
+                    title, message, state = self.__replace_regex(title, word, "")
+
+                if state:
+                    applied_words.append(word)
+                elif message:
+                    # An empty message just means the word did not match, which is no error
+                    logger.error(f"自定义识别词替换失败：{message}")
+            except Exception as err:
+                # A malformed word (e.g. a missing part) must not stop the remaining words
+                logger.error(f"自定义识别词替换失败：{word} {err}")
+
+        return title, applied_words
+
+    @staticmethod
+    def __replace_regex(title: str, replaced: str, replace: str) -> Tuple[str, str, bool]:
+        """
+        Regex replacement
+        :param title: title to process
+        :param replaced: regular expression to search for
+        :param replace: replacement template
+        :return: title, error message (empty unless an exception occurred), whether it matched
+        """
+        try:
+            new_title, count = re.subn(replaced, replace, title)
+            return new_title, "", count > 0
+        except Exception as err:
+            # Patterns come from user configuration and may be invalid: report, do not raise
+            return title, str(err), False
+
+    @staticmethod
+    def __episode_offset(title: str, front: str, back: str, offset: str) -> Tuple[str, str, bool]:
+        """
+        Episode offset: shift every number located between the front and back anchors
+        :param title: title to process
+        :param front: regular expression of the front anchor (may be empty)
+        :param back: regular expression of the back anchor (may be empty)
+        :param offset: expression in which EP stands for the matched number, e.g. EP+1
+        :return: title, error message (empty unless an exception occurred), whether it matched
+        """
+
+        def shift(match) -> str:
+            # Convert one matched number and keep its original notation
+            raw = match.group(0)
+            episode = int(cn2an.cn2an(raw, "smart"))
+            # NOTE(security): eval() executes the configured expression as Python code, so
+            # identifier words must only ever be editable by trusted administrators; consider
+            # replacing it with a restricted arithmetic evaluator
+            shifted = int(eval(offset.replace("EP", str(episode))))
+            # Chinese numerals are written back as Chinese numerals
+            if not raw.isdigit():
+                return cn2an.an2cn(shifted, "low")
+            # Arabic numerals get their original leading zeros back
+            zeros = re.match(r"0+", raw)
+            return f"{zeros.group(0)}{shifted}" if zeros else str(shifted)
+
+        try:
+            # Both anchors, when given, have to occur in the title
+            if back and not re.search(back, title):
+                return title, "", False
+            if front and not re.search(front, title):
+                return title, "", False
+            offset_re = re.compile(r'(?<=%s.*?)[0-9一二三四五六七八九十]+(?=.*?%s)' % (front, back))
+            # A single pass rewrites each number exactly once, so an already shifted value can
+            # never be shifted again by a later replacement (e.g. 11 -> 10 followed by 10 -> 9)
+            new_title, count = offset_re.subn(shift, title)
+            return new_title, "", count > 0
+        except Exception as err:
+            # Invalid regex, unparsable numeral or bad expression: leave the title untouched
+            return title, str(err), False
--- a/app/core/metainfo.py
+++ b/app/core/metainfo.py
@@ -3,27 +3,37 @@ from pathlib import Path
 import regex as re

 from app.core.config import settings
-from app.core.meta import MetaAnime, MetaVideo
+from app.core.meta import MetaAnime, MetaVideo, MetaBase
+from app.core.meta.words import WordsMatcher


-def MetaInfo(title: str, subtitle: str = None):
+def MetaInfo(title: str, subtitle: str = None) -> MetaBase:
    """
    媒体整理入口，根据名称和副标题，判断是哪种类型的识别，返回对应对象
    :param title: 标题、种子名、文件名
    :param subtitle: 副标题、描述
    :return: MetaAnime、MetaVideo
    """
-
+    # Keep the original title before the identifier words rewrite it
+    org_title = title
+    # Pre-process the title with the custom identifier words
+    title, apply_words = WordsMatcher().prepare(title)
    # 判断是否处理文件
    if title and Path(title).suffix.lower() in settings.RMT_MEDIAEXT:
        isfile = True
    else:
        isfile = False
+    # Recognize: anime titles are parsed by MetaAnime, all others by MetaVideo
+    meta = MetaAnime(title, subtitle, isfile) if is_anime(title) else MetaVideo(title, subtitle, isfile)
+    # Record the original title
+    meta.title = org_title
+    # Record the identifier words that were applied
+    meta.apply_words = apply_words or []

-    return MetaAnime(title, subtitle, isfile) if is_anime(title) else MetaVideo(title, subtitle, isfile)
+    return meta


-def is_anime(name: str):
+def is_anime(name: str) -> bool:
    """
    判断是否为动漫
    :param name: 名称
--- a/app/modules/words/init.py
+++ b/app/modules/words/init.py
@@ -1,28 +0,0 @@
-from typing import Tuple, Union
-
-from app.modules import _ModuleBase
-
-
-class WordsModule(_ModuleBase):
-    """
-    字幕下载模块
-    """
-
-    def init_module(self) -> None:
-        pass
-
-    def init_setting(self) -> Tuple[str, Union[str, bool]]:
-        pass
-
-    def stop(self) -> None:
-        pass
-
-    def prepare_recognize(self, title: str,
-                          subtitle: str = None) -> Tuple[str, str]:
-        """
-        处理各类特别命名，以便识别
-        :param title:     标题
-        :param subtitle:  副标题
-        :return: 处理后的标题、副标题，该方法可被多个模块同时处理
-        """
-        pass
--- a/app/schemas/types.py
+++ b/app/schemas/types.py
@@ -42,6 +42,10 @@ class SystemConfigKey(Enum):
    TorrentsPriority = "TorrentsPriority"
    # 通知消息渠道设置
    NotificationChannels = "NotificationChannels"
+    # 自定义制作组/字幕组
+    CustomReleaseGroups = "CustomReleaseGroups"
+    # 自定义识别词
+    CustomIdentifiers = "CustomIdentifiers"


 # 站点框架