feat：使用站点种子归类优化识别匹配

fix：优先级规则复杂时，过滤时间很长，调整到最后 fix #1432
2024-02-29 17:18:01 +08:00
parent 2e661f8759
commit 5dd36e95e0
5 changed files with 127 additions and 54 deletions
--- a/app/chain/search.py
+++ b/app/chain/search.py
@@ -135,28 +135,8 @@ class SearchChain(ChainBase):
        if not torrents:
            logger.warn(f'{keyword or mediainfo.title} 未搜索到资源')
            return []
-        # 过滤种子
-        if priority_rule is None:
-            # 取搜索优先级规则
-            priority_rule = self.systemconfig.get(SystemConfigKey.SearchFilterRules)
-        if priority_rule:
-            logger.info(f'开始过滤资源，当前规则：{priority_rule} ...')
-            result: List[TorrentInfo] = self.filter_torrents(rule_string=priority_rule,
-                                                             torrent_list=torrents,
-                                                             season_episodes=season_episodes,
-                                                             mediainfo=mediainfo)
-            if result is not None:
-                torrents = result
-            if not torrents:
-                logger.warn(f'{keyword or mediainfo.title} 没有符合优先级规则的资源')
-                return []
-        # 使用过滤规则再次过滤
-        torrents = self.filter_torrents_by_rule(torrents=torrents,
-                                                mediainfo=mediainfo,
-                                                filter_rule=filter_rule)
-        if not torrents:
-            logger.warn(f'{keyword or mediainfo.title} 没有符合过滤规则的资源')
-            return []
+        # 开始新进度
+        self.progress.start(ProgressKey.Search)
        # 匹配的资源
        _match_torrents = []
        # 总数
@@ -164,29 +144,32 @@ class SearchChain(ChainBase):
        # 已处理数
        _count = 0
        if mediainfo:
-            self.progress.start(ProgressKey.Search)
-            logger.info(f'开始匹配，总 {_total} 个资源 ...')
            # 英文标题应该在别名/原标题中，不需要再匹配
            logger.info(f"标题：{mediainfo.title}，原标题：{mediainfo.original_title}，别名：{mediainfo.names}")
            self.progress.update(value=0, text=f'开始匹配，总 {_total} 个资源 ...', key=ProgressKey.Search)
            for torrent in torrents:
                _count += 1
-                self.progress.update(value=(_count / _total) * 100,
+                self.progress.update(value=(_count / _total) * 96,
                                     text=f'正在匹配 {torrent.site_name}，已完成 {_count} / {_total} ...',
                                     key=ProgressKey.Search)
                # 比对IMDBID
                if torrent.imdbid \
                        and mediainfo.imdb_id \
                        and torrent.imdbid == mediainfo.imdb_id:
-                    logger.info(f'{mediainfo.title} 匹配到资源：{torrent.site_name} - {torrent.title}')
+                    logger.info(f'{mediainfo.title} 通过IMDBID匹配到资源：{torrent.site_name} - {torrent.title}')
                    _match_torrents.append(torrent)
                    continue
                # 识别
                torrent_meta = MetaInfo(title=torrent.title, subtitle=torrent.description)
-                # 比对类型
-                if (torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV) \
-                        or (torrent_meta.type != MediaType.TV and mediainfo.type == MediaType.TV):
-                    logger.warn(f'{torrent.site_name} - {torrent.title} 类型不匹配')
+                # 比对种子识别类型
+                if torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV:
+                    logger.warn(f'{torrent.site_name} - {torrent.title} 种子标题类型为 {torrent_meta.type.value}，'
+                                f'需要是 {mediainfo.type.value}，不匹配')
+                    continue
+                # 比对种子在站点中的类型
+                if torrent.category == MediaType.TV.value and mediainfo.type != MediaType.TV:
+                    logger.warn(f'{torrent.site_name} - {torrent.title} 种子在站点中归类为 {torrent.category}，'
+                                f'需要是 {mediainfo.type.value}，不匹配')
                    continue
                # 比对年份
                if mediainfo.year:
@@ -231,21 +214,55 @@ class SearchChain(ChainBase):
                        break
                else:
                    logger.warn(f'{torrent.site_name} - {torrent.title} 标题不匹配')
-            self.progress.update(value=100,
+            logger.info(f"匹配完成，共匹配到 {len(_match_torrents)} 个资源")
+            self.progress.update(value=97,
                                 text=f'匹配完成，共匹配到 {len(_match_torrents)} 个资源',
                                 key=ProgressKey.Search)
-            self.progress.end(ProgressKey.Search)
        else:
            _match_torrents = torrents
-        logger.info(f"匹配完成，共匹配到 {len(_match_torrents)} 个资源")
+        # 开始过滤
+        self.progress.update(value=98, text=f'开始过滤，总 {len(_match_torrents)} 个资源，请稍候...',
+                             key=ProgressKey.Search)
+        # 过滤种子
+        if priority_rule is None:
+            # 取搜索优先级规则
+            priority_rule = self.systemconfig.get(SystemConfigKey.SearchFilterRules)
+        if priority_rule:
+            logger.info(f'开始优先级规则过滤，当前规则：{priority_rule} ...')
+            result: List[TorrentInfo] = self.filter_torrents(rule_string=priority_rule,
+                                                             torrent_list=_match_torrents,
+                                                             season_episodes=season_episodes,
+                                                             mediainfo=mediainfo)
+            if result is not None:
+                _match_torrents = result
+            if not _match_torrents:
+                logger.warn(f'{keyword or mediainfo.title} 没有符合优先级规则的资源')
+                return []
+        # 使用过滤规则再次过滤
+        if filter_rule:
+            logger.info(f'开始过滤规则过滤，当前规则：{filter_rule} ...')
+            _match_torrents = self.filter_torrents_by_rule(torrents=_match_torrents,
+                                                           mediainfo=mediainfo,
+                                                           filter_rule=filter_rule)
+            if not _match_torrents:
+                logger.warn(f'{keyword or mediainfo.title} 没有符合过滤规则的资源')
+                return []
        # 去掉mediainfo中多余的数据
        mediainfo.clear()
        # 组装上下文
        contexts = [Context(meta_info=MetaInfo(title=torrent.title, subtitle=torrent.description),
                            media_info=mediainfo,
                            torrent_info=torrent) for torrent in _match_torrents]
+
+        logger.info(f"过滤完成，剩余 {_total} 个资源")
+        self.progress.update(value=99, text=f'过滤完成，剩余 {_total} 个资源', key=ProgressKey.Search)
        # 排序
+        self.progress.update(value=100,
+                             text=f'正在对 {len(contexts)} 个资源进行排序，请稍候...',
+                             key=ProgressKey.Search)
        contexts = self.torrenthelper.sort_torrents(contexts)
+        # 结束进度
+        self.progress.end(ProgressKey.Search)
        # 返回
        return contexts

--- a/app/chain/subscribe.py
+++ b/app/chain/subscribe.py
@@ -566,12 +566,14 @@ class SubscribeChain(ChainBase):
                        if torrent_mediainfo.douban_id \
                                and torrent_mediainfo.douban_id != mediainfo.douban_id:
                            continue
-                        logger.info(f'{mediainfo.title_year} 通过媒体信息匹配到资源：{torrent_info.site_name} - {torrent_info.title}')
+                        logger.info(f'{mediainfo.title_year} 通过媒体信ID匹配到资源：{torrent_info.site_name} - {torrent_info.title}')
                    else:
                        # 按标题匹配
-                        # 比对类型
-                        if (torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV) \
-                                or (torrent_meta.type != MediaType.TV and mediainfo.type == MediaType.TV):
+                        # 比对种子识别类型
+                        if torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV:
+                            continue
+                        # 比对种子在站点中的类型
+                        if torrent_info.category == MediaType.TV.value and mediainfo.type != MediaType.TV:
                            continue
                        # 比对年份
                        if mediainfo.year:
--- a/app/chain/torrents.py
+++ b/app/chain/torrents.py
@@ -16,7 +16,7 @@ from app.helper.sites import SitesHelper
 from app.helper.torrent import TorrentHelper
 from app.log import logger
 from app.schemas import Notification
-from app.schemas.types import SystemConfigKey, MessageChannel, NotificationType
+from app.schemas.types import SystemConfigKey, MessageChannel, NotificationType, MediaType
 from app.utils.singleton import Singleton
 from app.utils.string import StringUtils

@@ -184,6 +184,10 @@ class TorrentsChain(ChainBase, metaclass=Singleton):
                    logger.info(f'处理资源：{torrent.title} ...')
                    # 识别
                    meta = MetaInfo(title=torrent.title, subtitle=torrent.description)
+                    # 使用站点种子分类，校正类型识别
+                    if meta.type != MediaType.TV \
+                            and torrent.category == MediaType.TV.value:
+                        meta.type = MediaType.TV
                    # 识别媒体信息
                    mediainfo: MediaInfo = self.mediachain.recognize_by_meta(meta)
                    if not mediainfo:
--- a/app/core/context.py
+++ b/app/core/context.py
@@ -57,6 +57,8 @@ class TorrentInfo:
    labels: list = field(default_factory=list)
    # 种子优先级
    pri_order: int = 0
+    # 种子分类 电影/电视剧
+    category: str = None

    def __setattr__(self, name: str, value: Any):
        self.__dict__[name] = value
--- a/app/modules/indexer/spider.py
+++ b/app/modules/indexer/spider.py
@@ -3,7 +3,7 @@ import datetime
 import re
 import traceback
 from typing import List
-from urllib.parse import quote, urlencode
+from urllib.parse import quote, urlencode, urlparse, parse_qs

 import chardet
 from jinja2 import Template
@@ -276,7 +276,7 @@ class TorrentSpider:
        return self.parse(page_source)

    def __get_title(self, torrent):
-        # title default
+        # title default text
        if 'title' not in self.fields:
            return
        selector = self.fields.get('title', {})
@@ -306,7 +306,7 @@ class TorrentSpider:
                                                         selector.get('filters'))

    def __get_description(self, torrent):
-        # title optional
+        # description optional text
        if 'description' not in self.fields:
            return
        selector = self.fields.get('description', {})
@@ -352,7 +352,7 @@ class TorrentSpider:
                                                               selector.get('filters'))

    def __get_detail(self, torrent):
-        # details
+        # details page text
        if 'details' not in self.fields:
            return
        selector = self.fields.get('details', {})
@@ -373,7 +373,7 @@ class TorrentSpider:
                self.torrents_info['page_url'] = detail_link

    def __get_download(self, torrent):
-        # download link
+        # download link text
        if 'download' not in self.fields:
            return
        selector = self.fields.get('download', {})
@@ -403,7 +403,7 @@ class TorrentSpider:
                                                          selector.get('filters'))

    def __get_size(self, torrent):
-        # torrent size
+        # torrent size int
        if 'size' not in self.fields:
            return
        selector = self.fields.get('size', {})
@@ -420,7 +420,7 @@ class TorrentSpider:
            self.torrents_info['size'] = 0

    def __get_leechers(self, torrent):
-        # torrent leechers
+        # torrent leechers int
        if 'leechers' not in self.fields:
            return
        selector = self.fields.get('leechers', {})
@@ -438,7 +438,7 @@ class TorrentSpider:
            self.torrents_info['peers'] = 0

    def __get_seeders(self, torrent):
-        # torrent leechers
+        # torrent seeders int
        if 'seeders' not in self.fields:
            return
        selector = self.fields.get('seeders', {})
@@ -456,7 +456,7 @@ class TorrentSpider:
            self.torrents_info['seeders'] = 0

    def __get_grabs(self, torrent):
-        # torrent grabs
+        # torrent grabs int
        if 'grabs' not in self.fields:
            return
        selector = self.fields.get('grabs', {})
@@ -474,7 +474,7 @@ class TorrentSpider:
            self.torrents_info['grabs'] = 0

    def __get_pubdate(self, torrent):
-        # torrent pubdate
+        # torrent pubdate yyyy-mm-dd hh:mm:ss
        if 'date_added' not in self.fields:
            return
        selector = self.fields.get('date_added', {})
@@ -486,7 +486,7 @@ class TorrentSpider:
                                                           selector.get('filters'))

    def __get_date_elapsed(self, torrent):
-        # torrent pubdate
+        # torrent date elapsed text
        if 'date_elapsed' not in self.fields:
            return
        selector = self.fields.get('date_elapsed', {})
@@ -498,7 +498,7 @@ class TorrentSpider:
                                                                selector.get('filters'))

    def __get_downloadvolumefactor(self, torrent):
-        # downloadvolumefactor
+        # downloadvolumefactor int
        selector = self.fields.get('downloadvolumefactor', {})
        if not selector:
            return
@@ -521,7 +521,7 @@ class TorrentSpider:
                    self.torrents_info['downloadvolumefactor'] = int(downloadvolumefactor.group(1))

    def __get_uploadvolumefactor(self, torrent):
-        # uploadvolumefactor
+        # uploadvolumefactor int
        selector = self.fields.get('uploadvolumefactor', {})
        if not selector:
            return
@@ -544,7 +544,7 @@ class TorrentSpider:
                    self.torrents_info['uploadvolumefactor'] = int(uploadvolumefactor.group(1))

    def __get_labels(self, torrent):
-        # labels
+        # labels ['label1', 'label2']
        if 'labels' not in self.fields:
            return
        selector = self.fields.get('labels', {})
@@ -557,7 +557,7 @@ class TorrentSpider:
            self.torrents_info['labels'] = []

    def __get_free_date(self, torrent):
-        # free date
+        # free date yyyy-mm-dd hh:mm:ss
        if 'freedate' not in self.fields:
            return
        selector = self.fields.get('freedate', {})
@@ -569,7 +569,7 @@ class TorrentSpider:
                                                            selector.get('filters'))

    def __get_hit_and_run(self, torrent):
-        # hitandrun
+        # hitandrun True/False
        if 'hr' not in self.fields:
            return
        selector = self.fields.get('hr', {})
@@ -579,28 +579,71 @@ class TorrentSpider:
        else:
            self.torrents_info['hit_and_run'] = False

+    def __get_category(self, torrent):
+        # category 电影/电视剧
+        if 'category' not in self.fields:
+            return
+        selector = self.fields.get('category', {})
+        category = torrent(selector.get('selector', '')).clone()
+        self.__remove(category, selector)
+        items = self.__attribute_or_text(category, selector)
+        category_value = self.__index(items, selector)
+        category_value = self.__filter_text(category_value,
+                                            selector.get('filters'))
+        if category_value and self.category:
+            tv_cats = [str(cat.get("id")) for cat in self.category.get("tv") or []]
+            movie_cats = [str(cat.get("id")) for cat in self.category.get("movie") or []]
+            if category_value in tv_cats \
+                    and category_value not in movie_cats:
+                self.torrents_info['category'] = MediaType.TV.value
+            elif category_value in movie_cats:
+                self.torrents_info['category'] = MediaType.MOVIE.value
+            else:
+                self.torrents_info['category'] = MediaType.UNKNOWN.value
+        else:
+            self.torrents_info['category'] = MediaType.UNKNOWN.value
+
    def get_info(self, torrent) -> dict:
        """
        解析单条种子数据
        """
        self.torrents_info = {}
        try:
+            # 标题
            self.__get_title(torrent)
+            # 描述
            self.__get_description(torrent)
+            # 详情页面
            self.__get_detail(torrent)
+            # 下载链接
            self.__get_download(torrent)
+            # 完成数
            self.__get_grabs(torrent)
+            # 下载数
            self.__get_leechers(torrent)
+            # 做种数
            self.__get_seeders(torrent)
+            # 大小
            self.__get_size(torrent)
+            # IMDBID
            self.__get_imdbid(torrent)
+            # 下载系数
            self.__get_downloadvolumefactor(torrent)
+            # 上传系数
            self.__get_uploadvolumefactor(torrent)
+            # 发布时间
            self.__get_pubdate(torrent)
+            # 已发布时间
            self.__get_date_elapsed(torrent)
+            # 免费截止时间
            self.__get_free_date(torrent)
+            # 标签
            self.__get_labels(torrent)
+            # HR
            self.__get_hit_and_run(torrent)
+            # 分类
+            self.__get_category(torrent)
+
        except Exception as err:
            logger.error("%s 搜索出现错误：%s" % (self.indexername, str(err)))
        return self.torrents_info
@@ -632,6 +675,11 @@ class TorrentSpider:
                    text = text.strip()
                elif method_name == "appendleft":
                    text = f"{args}{text}"
+                elif method_name == "querystring":
+                    parsed_url = urlparse(text)
+                    query_params = parse_qs(parsed_url.query)
+                    param_value = query_params.get(args)
+                    text = param_value[0] if param_value else ''
            except Exception as err:
                logger.debug(f'过滤器 {method_name} 处理失败：{str(err)} - {traceback.format_exc()}')
        return text.strip()