feat:使用站点种子归类优化识别匹配

fix:优先级规则复杂时,过滤时间很长,调整到最后
fix #1432
This commit is contained in:
jxxghp 2024-02-29 17:18:01 +08:00
parent 2e661f8759
commit 5dd36e95e0
5 changed files with 127 additions and 54 deletions

View File

@ -135,28 +135,8 @@ class SearchChain(ChainBase):
if not torrents:
logger.warn(f'{keyword or mediainfo.title} 未搜索到资源')
return []
# 过滤种子
if priority_rule is None:
# 取搜索优先级规则
priority_rule = self.systemconfig.get(SystemConfigKey.SearchFilterRules)
if priority_rule:
logger.info(f'开始过滤资源,当前规则:{priority_rule} ...')
result: List[TorrentInfo] = self.filter_torrents(rule_string=priority_rule,
torrent_list=torrents,
season_episodes=season_episodes,
mediainfo=mediainfo)
if result is not None:
torrents = result
if not torrents:
logger.warn(f'{keyword or mediainfo.title} 没有符合优先级规则的资源')
return []
# 使用过滤规则再次过滤
torrents = self.filter_torrents_by_rule(torrents=torrents,
mediainfo=mediainfo,
filter_rule=filter_rule)
if not torrents:
logger.warn(f'{keyword or mediainfo.title} 没有符合过滤规则的资源')
return []
# 开始新进度
self.progress.start(ProgressKey.Search)
# 匹配的资源
_match_torrents = []
# 总数
@ -164,29 +144,32 @@ class SearchChain(ChainBase):
# 已处理数
_count = 0
if mediainfo:
self.progress.start(ProgressKey.Search)
logger.info(f'开始匹配,总 {_total} 个资源 ...')
# 英文标题应该在别名/原标题中,不需要再匹配
logger.info(f"标题:{mediainfo.title},原标题:{mediainfo.original_title},别名:{mediainfo.names}")
self.progress.update(value=0, text=f'开始匹配,总 {_total} 个资源 ...', key=ProgressKey.Search)
for torrent in torrents:
_count += 1
self.progress.update(value=(_count / _total) * 100,
self.progress.update(value=(_count / _total) * 96,
text=f'正在匹配 {torrent.site_name},已完成 {_count} / {_total} ...',
key=ProgressKey.Search)
# 比对IMDBID
if torrent.imdbid \
and mediainfo.imdb_id \
and torrent.imdbid == mediainfo.imdb_id:
logger.info(f'{mediainfo.title} 匹配到资源:{torrent.site_name} - {torrent.title}')
logger.info(f'{mediainfo.title} 通过IMDBID匹配到资源:{torrent.site_name} - {torrent.title}')
_match_torrents.append(torrent)
continue
# 识别
torrent_meta = MetaInfo(title=torrent.title, subtitle=torrent.description)
# 比对类型
if (torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV) \
or (torrent_meta.type != MediaType.TV and mediainfo.type == MediaType.TV):
logger.warn(f'{torrent.site_name} - {torrent.title} 类型不匹配')
# 比对种子识别类型
if torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV:
logger.warn(f'{torrent.site_name} - {torrent.title} 种子标题类型为 {torrent_meta.type.value}'
f'需要是 {mediainfo.type.value},不匹配')
continue
# 比对种子在站点中的类型
if torrent.category == MediaType.TV.value and mediainfo.type != MediaType.TV:
logger.warn(f'{torrent.site_name} - {torrent.title} 种子在站点中归类为 {torrent.category}'
f'需要是 {mediainfo.type.value},不匹配')
continue
# 比对年份
if mediainfo.year:
@ -231,21 +214,55 @@ class SearchChain(ChainBase):
break
else:
logger.warn(f'{torrent.site_name} - {torrent.title} 标题不匹配')
self.progress.update(value=100,
logger.info(f"匹配完成,共匹配到 {len(_match_torrents)} 个资源")
self.progress.update(value=97,
text=f'匹配完成,共匹配到 {len(_match_torrents)} 个资源',
key=ProgressKey.Search)
self.progress.end(ProgressKey.Search)
else:
_match_torrents = torrents
logger.info(f"匹配完成,共匹配到 {len(_match_torrents)} 个资源")
# 开始过滤
self.progress.update(value=98, text=f'开始过滤,总 {len(_match_torrents)} 个资源,请稍候...',
key=ProgressKey.Search)
# 过滤种子
if priority_rule is None:
# 取搜索优先级规则
priority_rule = self.systemconfig.get(SystemConfigKey.SearchFilterRules)
if priority_rule:
logger.info(f'开始优先级规则过滤,当前规则:{priority_rule} ...')
result: List[TorrentInfo] = self.filter_torrents(rule_string=priority_rule,
torrent_list=_match_torrents,
season_episodes=season_episodes,
mediainfo=mediainfo)
if result is not None:
_match_torrents = result
if not _match_torrents:
logger.warn(f'{keyword or mediainfo.title} 没有符合优先级规则的资源')
return []
# 使用过滤规则再次过滤
if filter_rule:
logger.info(f'开始过滤规则过滤,当前规则:{filter_rule} ...')
_match_torrents = self.filter_torrents_by_rule(torrents=_match_torrents,
mediainfo=mediainfo,
filter_rule=filter_rule)
if not _match_torrents:
logger.warn(f'{keyword or mediainfo.title} 没有符合过滤规则的资源')
return []
# 去掉mediainfo中多余的数据
mediainfo.clear()
# 组装上下文
contexts = [Context(meta_info=MetaInfo(title=torrent.title, subtitle=torrent.description),
media_info=mediainfo,
torrent_info=torrent) for torrent in _match_torrents]
logger.info(f"过滤完成,剩余 {_total} 个资源")
self.progress.update(value=99, text=f'过滤完成,剩余 {_total} 个资源', key=ProgressKey.Search)
# 排序
self.progress.update(value=100,
text=f'正在对 {len(contexts)} 个资源进行排序,请稍候...',
key=ProgressKey.Search)
contexts = self.torrenthelper.sort_torrents(contexts)
# 结束进度
self.progress.end(ProgressKey.Search)
# 返回
return contexts

View File

@ -566,12 +566,14 @@ class SubscribeChain(ChainBase):
if torrent_mediainfo.douban_id \
and torrent_mediainfo.douban_id != mediainfo.douban_id:
continue
logger.info(f'{mediainfo.title_year} 通过媒体信匹配到资源:{torrent_info.site_name} - {torrent_info.title}')
logger.info(f'{mediainfo.title_year} 通过媒体信ID匹配到资源:{torrent_info.site_name} - {torrent_info.title}')
else:
# 按标题匹配
# 比对类型
if (torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV) \
or (torrent_meta.type != MediaType.TV and mediainfo.type == MediaType.TV):
# 比对种子识别类型
if torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV:
continue
# 比对种子在站点中的类型
if torrent_info.category == MediaType.TV.value and mediainfo.type != MediaType.TV:
continue
# 比对年份
if mediainfo.year:

View File

@ -16,7 +16,7 @@ from app.helper.sites import SitesHelper
from app.helper.torrent import TorrentHelper
from app.log import logger
from app.schemas import Notification
from app.schemas.types import SystemConfigKey, MessageChannel, NotificationType
from app.schemas.types import SystemConfigKey, MessageChannel, NotificationType, MediaType
from app.utils.singleton import Singleton
from app.utils.string import StringUtils
@ -184,6 +184,10 @@ class TorrentsChain(ChainBase, metaclass=Singleton):
logger.info(f'处理资源:{torrent.title} ...')
# 识别
meta = MetaInfo(title=torrent.title, subtitle=torrent.description)
# 使用站点种子分类,校正类型识别
if meta.type != MediaType.TV \
and torrent.category == MediaType.TV.value:
meta.type = MediaType.TV
# 识别媒体信息
mediainfo: MediaInfo = self.mediachain.recognize_by_meta(meta)
if not mediainfo:

View File

@ -57,6 +57,8 @@ class TorrentInfo:
labels: list = field(default_factory=list)
# 种子优先级
pri_order: int = 0
# 种子分类 电影/电视剧
category: str = None
def __setattr__(self, name: str, value: Any):
self.__dict__[name] = value

View File

@ -3,7 +3,7 @@ import datetime
import re
import traceback
from typing import List
from urllib.parse import quote, urlencode
from urllib.parse import quote, urlencode, urlparse, parse_qs
import chardet
from jinja2 import Template
@ -276,7 +276,7 @@ class TorrentSpider:
return self.parse(page_source)
def __get_title(self, torrent):
# title default
# title default text
if 'title' not in self.fields:
return
selector = self.fields.get('title', {})
@ -306,7 +306,7 @@ class TorrentSpider:
selector.get('filters'))
def __get_description(self, torrent):
# title optional
# title optional text
if 'description' not in self.fields:
return
selector = self.fields.get('description', {})
@ -352,7 +352,7 @@ class TorrentSpider:
selector.get('filters'))
def __get_detail(self, torrent):
# details
# details page text
if 'details' not in self.fields:
return
selector = self.fields.get('details', {})
@ -373,7 +373,7 @@ class TorrentSpider:
self.torrents_info['page_url'] = detail_link
def __get_download(self, torrent):
# download link
# download link text
if 'download' not in self.fields:
return
selector = self.fields.get('download', {})
@ -403,7 +403,7 @@ class TorrentSpider:
selector.get('filters'))
def __get_size(self, torrent):
# torrent size
# torrent size int
if 'size' not in self.fields:
return
selector = self.fields.get('size', {})
@ -420,7 +420,7 @@ class TorrentSpider:
self.torrents_info['size'] = 0
def __get_leechers(self, torrent):
# torrent leechers
# torrent leechers int
if 'leechers' not in self.fields:
return
selector = self.fields.get('leechers', {})
@ -438,7 +438,7 @@ class TorrentSpider:
self.torrents_info['peers'] = 0
def __get_seeders(self, torrent):
# torrent leechers
# torrent leechers int
if 'seeders' not in self.fields:
return
selector = self.fields.get('seeders', {})
@ -456,7 +456,7 @@ class TorrentSpider:
self.torrents_info['seeders'] = 0
def __get_grabs(self, torrent):
# torrent grabs
# torrent grabs int
if 'grabs' not in self.fields:
return
selector = self.fields.get('grabs', {})
@ -474,7 +474,7 @@ class TorrentSpider:
self.torrents_info['grabs'] = 0
def __get_pubdate(self, torrent):
# torrent pubdate
# torrent pubdate yyyy-mm-dd hh:mm:ss
if 'date_added' not in self.fields:
return
selector = self.fields.get('date_added', {})
@ -486,7 +486,7 @@ class TorrentSpider:
selector.get('filters'))
def __get_date_elapsed(self, torrent):
# torrent pubdate
# torrent data elaspsed text
if 'date_elapsed' not in self.fields:
return
selector = self.fields.get('date_elapsed', {})
@ -498,7 +498,7 @@ class TorrentSpider:
selector.get('filters'))
def __get_downloadvolumefactor(self, torrent):
# downloadvolumefactor
# downloadvolumefactor int
selector = self.fields.get('downloadvolumefactor', {})
if not selector:
return
@ -521,7 +521,7 @@ class TorrentSpider:
self.torrents_info['downloadvolumefactor'] = int(downloadvolumefactor.group(1))
def __get_uploadvolumefactor(self, torrent):
# uploadvolumefactor
# uploadvolumefactor int
selector = self.fields.get('uploadvolumefactor', {})
if not selector:
return
@ -544,7 +544,7 @@ class TorrentSpider:
self.torrents_info['uploadvolumefactor'] = int(uploadvolumefactor.group(1))
def __get_labels(self, torrent):
# labels
# labels ['label1', 'label2']
if 'labels' not in self.fields:
return
selector = self.fields.get('labels', {})
@ -557,7 +557,7 @@ class TorrentSpider:
self.torrents_info['labels'] = []
def __get_free_date(self, torrent):
# free date
# free date yyyy-mm-dd hh:mm:ss
if 'freedate' not in self.fields:
return
selector = self.fields.get('freedate', {})
@ -569,7 +569,7 @@ class TorrentSpider:
selector.get('filters'))
def __get_hit_and_run(self, torrent):
# hitandrun
# hitandrun True/False
if 'hr' not in self.fields:
return
selector = self.fields.get('hr', {})
@ -579,28 +579,71 @@ class TorrentSpider:
else:
self.torrents_info['hit_and_run'] = False
def __get_category(self, torrent):
# category 电影/电视剧
if 'category' not in self.fields:
return
selector = self.fields.get('category', {})
category = torrent(selector.get('selector', '')).clone()
self.__remove(category, selector)
items = self.__attribute_or_text(category, selector)
category_value = self.__index(items, selector)
category_value = self.__filter_text(category_value,
selector.get('filters'))
if category_value and self.category:
tv_cats = [str(cat.get("id")) for cat in self.category.get("tv") or []]
movie_cats = [str(cat.get("id")) for cat in self.category.get("movie") or []]
if category_value in tv_cats \
and category_value not in movie_cats:
self.torrents_info['category'] = MediaType.TV.value
elif category_value in movie_cats:
self.torrents_info['category'] = MediaType.MOVIE.value
else:
self.torrents_info['category'] = MediaType.UNKNOWN.value
else:
self.torrents_info['category'] = MediaType.UNKNOWN.value
def get_info(self, torrent) -> dict:
"""
解析单条种子数据
"""
self.torrents_info = {}
try:
# 标题
self.__get_title(torrent)
# 描述
self.__get_description(torrent)
# 详情页面
self.__get_detail(torrent)
# 下载链接
self.__get_download(torrent)
# 完成数
self.__get_grabs(torrent)
# 下载数
self.__get_leechers(torrent)
# 做种数
self.__get_seeders(torrent)
# 大小
self.__get_size(torrent)
# IMDBID
self.__get_imdbid(torrent)
# 下载系数
self.__get_downloadvolumefactor(torrent)
# 上传系数
self.__get_uploadvolumefactor(torrent)
# 发布时间
self.__get_pubdate(torrent)
# 已发布时间
self.__get_date_elapsed(torrent)
# 免费载止时间
self.__get_free_date(torrent)
# 标签
self.__get_labels(torrent)
# HR
self.__get_hit_and_run(torrent)
# 分类
self.__get_category(torrent)
except Exception as err:
logger.error("%s 搜索出现错误:%s" % (self.indexername, str(err)))
return self.torrents_info
@ -632,6 +675,11 @@ class TorrentSpider:
text = text.strip()
elif method_name == "appendleft":
text = f"{args}{text}"
elif method_name == "querystring":
parsed_url = urlparse(text)
query_params = parse_qs(parsed_url.query)
param_value = query_params.get(args)
text = param_value[0] if param_value else ''
except Exception as err:
logger.debug(f'过滤器 {method_name} 处理失败:{str(err)} - {traceback.format_exc()}')
return text.strip()