feat:使用站点种子归类优化识别匹配

fix:优先级规则复杂时,过滤时间很长,调整到最后
fix #1432
This commit is contained in:
jxxghp 2024-02-29 17:18:01 +08:00
parent 2e661f8759
commit 5dd36e95e0
5 changed files with 127 additions and 54 deletions

View File

@ -135,28 +135,8 @@ class SearchChain(ChainBase):
if not torrents: if not torrents:
logger.warn(f'{keyword or mediainfo.title} 未搜索到资源') logger.warn(f'{keyword or mediainfo.title} 未搜索到资源')
return [] return []
# 过滤种子 # 开始新进度
if priority_rule is None: self.progress.start(ProgressKey.Search)
# 取搜索优先级规则
priority_rule = self.systemconfig.get(SystemConfigKey.SearchFilterRules)
if priority_rule:
logger.info(f'开始过滤资源,当前规则:{priority_rule} ...')
result: List[TorrentInfo] = self.filter_torrents(rule_string=priority_rule,
torrent_list=torrents,
season_episodes=season_episodes,
mediainfo=mediainfo)
if result is not None:
torrents = result
if not torrents:
logger.warn(f'{keyword or mediainfo.title} 没有符合优先级规则的资源')
return []
# 使用过滤规则再次过滤
torrents = self.filter_torrents_by_rule(torrents=torrents,
mediainfo=mediainfo,
filter_rule=filter_rule)
if not torrents:
logger.warn(f'{keyword or mediainfo.title} 没有符合过滤规则的资源')
return []
# 匹配的资源 # 匹配的资源
_match_torrents = [] _match_torrents = []
# 总数 # 总数
@ -164,29 +144,32 @@ class SearchChain(ChainBase):
# 已处理数 # 已处理数
_count = 0 _count = 0
if mediainfo: if mediainfo:
self.progress.start(ProgressKey.Search)
logger.info(f'开始匹配,总 {_total} 个资源 ...')
# 英文标题应该在别名/原标题中,不需要再匹配 # 英文标题应该在别名/原标题中,不需要再匹配
logger.info(f"标题:{mediainfo.title},原标题:{mediainfo.original_title},别名:{mediainfo.names}") logger.info(f"标题:{mediainfo.title},原标题:{mediainfo.original_title},别名:{mediainfo.names}")
self.progress.update(value=0, text=f'开始匹配,总 {_total} 个资源 ...', key=ProgressKey.Search) self.progress.update(value=0, text=f'开始匹配,总 {_total} 个资源 ...', key=ProgressKey.Search)
for torrent in torrents: for torrent in torrents:
_count += 1 _count += 1
self.progress.update(value=(_count / _total) * 100, self.progress.update(value=(_count / _total) * 96,
text=f'正在匹配 {torrent.site_name},已完成 {_count} / {_total} ...', text=f'正在匹配 {torrent.site_name},已完成 {_count} / {_total} ...',
key=ProgressKey.Search) key=ProgressKey.Search)
# 比对IMDBID # 比对IMDBID
if torrent.imdbid \ if torrent.imdbid \
and mediainfo.imdb_id \ and mediainfo.imdb_id \
and torrent.imdbid == mediainfo.imdb_id: and torrent.imdbid == mediainfo.imdb_id:
logger.info(f'{mediainfo.title} 匹配到资源:{torrent.site_name} - {torrent.title}') logger.info(f'{mediainfo.title} 通过IMDBID匹配到资源:{torrent.site_name} - {torrent.title}')
_match_torrents.append(torrent) _match_torrents.append(torrent)
continue continue
# 识别 # 识别
torrent_meta = MetaInfo(title=torrent.title, subtitle=torrent.description) torrent_meta = MetaInfo(title=torrent.title, subtitle=torrent.description)
# 比对类型 # 比对种子识别类型
if (torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV) \ if torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV:
or (torrent_meta.type != MediaType.TV and mediainfo.type == MediaType.TV): logger.warn(f'{torrent.site_name} - {torrent.title} 种子标题类型为 {torrent_meta.type.value}'
logger.warn(f'{torrent.site_name} - {torrent.title} 类型不匹配') f'需要是 {mediainfo.type.value},不匹配')
continue
# 比对种子在站点中的类型
if torrent.category == MediaType.TV.value and mediainfo.type != MediaType.TV:
logger.warn(f'{torrent.site_name} - {torrent.title} 种子在站点中归类为 {torrent.category}'
f'需要是 {mediainfo.type.value},不匹配')
continue continue
# 比对年份 # 比对年份
if mediainfo.year: if mediainfo.year:
@ -231,21 +214,55 @@ class SearchChain(ChainBase):
break break
else: else:
logger.warn(f'{torrent.site_name} - {torrent.title} 标题不匹配') logger.warn(f'{torrent.site_name} - {torrent.title} 标题不匹配')
self.progress.update(value=100, logger.info(f"匹配完成,共匹配到 {len(_match_torrents)} 个资源")
self.progress.update(value=97,
text=f'匹配完成,共匹配到 {len(_match_torrents)} 个资源', text=f'匹配完成,共匹配到 {len(_match_torrents)} 个资源',
key=ProgressKey.Search) key=ProgressKey.Search)
self.progress.end(ProgressKey.Search)
else: else:
_match_torrents = torrents _match_torrents = torrents
logger.info(f"匹配完成,共匹配到 {len(_match_torrents)} 个资源") # 开始过滤
self.progress.update(value=98, text=f'开始过滤,总 {len(_match_torrents)} 个资源,请稍候...',
key=ProgressKey.Search)
# 过滤种子
if priority_rule is None:
# 取搜索优先级规则
priority_rule = self.systemconfig.get(SystemConfigKey.SearchFilterRules)
if priority_rule:
logger.info(f'开始优先级规则过滤,当前规则:{priority_rule} ...')
result: List[TorrentInfo] = self.filter_torrents(rule_string=priority_rule,
torrent_list=_match_torrents,
season_episodes=season_episodes,
mediainfo=mediainfo)
if result is not None:
_match_torrents = result
if not _match_torrents:
logger.warn(f'{keyword or mediainfo.title} 没有符合优先级规则的资源')
return []
# 使用过滤规则再次过滤
if filter_rule:
logger.info(f'开始过滤规则过滤,当前规则:{filter_rule} ...')
_match_torrents = self.filter_torrents_by_rule(torrents=_match_torrents,
mediainfo=mediainfo,
filter_rule=filter_rule)
if not _match_torrents:
logger.warn(f'{keyword or mediainfo.title} 没有符合过滤规则的资源')
return []
# 去掉mediainfo中多余的数据 # 去掉mediainfo中多余的数据
mediainfo.clear() mediainfo.clear()
# 组装上下文 # 组装上下文
contexts = [Context(meta_info=MetaInfo(title=torrent.title, subtitle=torrent.description), contexts = [Context(meta_info=MetaInfo(title=torrent.title, subtitle=torrent.description),
media_info=mediainfo, media_info=mediainfo,
torrent_info=torrent) for torrent in _match_torrents] torrent_info=torrent) for torrent in _match_torrents]
logger.info(f"过滤完成,剩余 {_total} 个资源")
self.progress.update(value=99, text=f'过滤完成,剩余 {_total} 个资源', key=ProgressKey.Search)
# 排序 # 排序
self.progress.update(value=100,
text=f'正在对 {len(contexts)} 个资源进行排序,请稍候...',
key=ProgressKey.Search)
contexts = self.torrenthelper.sort_torrents(contexts) contexts = self.torrenthelper.sort_torrents(contexts)
# 结束进度
self.progress.end(ProgressKey.Search)
# 返回 # 返回
return contexts return contexts

View File

@ -566,12 +566,14 @@ class SubscribeChain(ChainBase):
if torrent_mediainfo.douban_id \ if torrent_mediainfo.douban_id \
and torrent_mediainfo.douban_id != mediainfo.douban_id: and torrent_mediainfo.douban_id != mediainfo.douban_id:
continue continue
logger.info(f'{mediainfo.title_year} 通过媒体信匹配到资源:{torrent_info.site_name} - {torrent_info.title}') logger.info(f'{mediainfo.title_year} 通过媒体信ID匹配到资源:{torrent_info.site_name} - {torrent_info.title}')
else: else:
# 按标题匹配 # 按标题匹配
# 比对类型 # 比对种子识别类型
if (torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV) \ if torrent_meta.type == MediaType.TV and mediainfo.type != MediaType.TV:
or (torrent_meta.type != MediaType.TV and mediainfo.type == MediaType.TV): continue
# 比对种子在站点中的类型
if torrent_info.category == MediaType.TV.value and mediainfo.type != MediaType.TV:
continue continue
# 比对年份 # 比对年份
if mediainfo.year: if mediainfo.year:

View File

@ -16,7 +16,7 @@ from app.helper.sites import SitesHelper
from app.helper.torrent import TorrentHelper from app.helper.torrent import TorrentHelper
from app.log import logger from app.log import logger
from app.schemas import Notification from app.schemas import Notification
from app.schemas.types import SystemConfigKey, MessageChannel, NotificationType from app.schemas.types import SystemConfigKey, MessageChannel, NotificationType, MediaType
from app.utils.singleton import Singleton from app.utils.singleton import Singleton
from app.utils.string import StringUtils from app.utils.string import StringUtils
@ -184,6 +184,10 @@ class TorrentsChain(ChainBase, metaclass=Singleton):
logger.info(f'处理资源:{torrent.title} ...') logger.info(f'处理资源:{torrent.title} ...')
# 识别 # 识别
meta = MetaInfo(title=torrent.title, subtitle=torrent.description) meta = MetaInfo(title=torrent.title, subtitle=torrent.description)
# 使用站点种子分类,校正类型识别
if meta.type != MediaType.TV \
and torrent.category == MediaType.TV.value:
meta.type = MediaType.TV
# 识别媒体信息 # 识别媒体信息
mediainfo: MediaInfo = self.mediachain.recognize_by_meta(meta) mediainfo: MediaInfo = self.mediachain.recognize_by_meta(meta)
if not mediainfo: if not mediainfo:

View File

@ -57,6 +57,8 @@ class TorrentInfo:
labels: list = field(default_factory=list) labels: list = field(default_factory=list)
# 种子优先级 # 种子优先级
pri_order: int = 0 pri_order: int = 0
# 种子分类 电影/电视剧
category: str = None
def __setattr__(self, name: str, value: Any): def __setattr__(self, name: str, value: Any):
self.__dict__[name] = value self.__dict__[name] = value

View File

@ -3,7 +3,7 @@ import datetime
import re import re
import traceback import traceback
from typing import List from typing import List
from urllib.parse import quote, urlencode from urllib.parse import quote, urlencode, urlparse, parse_qs
import chardet import chardet
from jinja2 import Template from jinja2 import Template
@ -276,7 +276,7 @@ class TorrentSpider:
return self.parse(page_source) return self.parse(page_source)
def __get_title(self, torrent): def __get_title(self, torrent):
# title default # title default text
if 'title' not in self.fields: if 'title' not in self.fields:
return return
selector = self.fields.get('title', {}) selector = self.fields.get('title', {})
@ -306,7 +306,7 @@ class TorrentSpider:
selector.get('filters')) selector.get('filters'))
def __get_description(self, torrent): def __get_description(self, torrent):
# title optional # title optional text
if 'description' not in self.fields: if 'description' not in self.fields:
return return
selector = self.fields.get('description', {}) selector = self.fields.get('description', {})
@ -352,7 +352,7 @@ class TorrentSpider:
selector.get('filters')) selector.get('filters'))
def __get_detail(self, torrent): def __get_detail(self, torrent):
# details # details page text
if 'details' not in self.fields: if 'details' not in self.fields:
return return
selector = self.fields.get('details', {}) selector = self.fields.get('details', {})
@ -373,7 +373,7 @@ class TorrentSpider:
self.torrents_info['page_url'] = detail_link self.torrents_info['page_url'] = detail_link
def __get_download(self, torrent): def __get_download(self, torrent):
# download link # download link text
if 'download' not in self.fields: if 'download' not in self.fields:
return return
selector = self.fields.get('download', {}) selector = self.fields.get('download', {})
@ -403,7 +403,7 @@ class TorrentSpider:
selector.get('filters')) selector.get('filters'))
def __get_size(self, torrent): def __get_size(self, torrent):
# torrent size # torrent size int
if 'size' not in self.fields: if 'size' not in self.fields:
return return
selector = self.fields.get('size', {}) selector = self.fields.get('size', {})
@ -420,7 +420,7 @@ class TorrentSpider:
self.torrents_info['size'] = 0 self.torrents_info['size'] = 0
def __get_leechers(self, torrent): def __get_leechers(self, torrent):
# torrent leechers # torrent leechers int
if 'leechers' not in self.fields: if 'leechers' not in self.fields:
return return
selector = self.fields.get('leechers', {}) selector = self.fields.get('leechers', {})
@ -438,7 +438,7 @@ class TorrentSpider:
self.torrents_info['peers'] = 0 self.torrents_info['peers'] = 0
def __get_seeders(self, torrent): def __get_seeders(self, torrent):
# torrent leechers # torrent seeders int
if 'seeders' not in self.fields: if 'seeders' not in self.fields:
return return
selector = self.fields.get('seeders', {}) selector = self.fields.get('seeders', {})
@ -456,7 +456,7 @@ class TorrentSpider:
self.torrents_info['seeders'] = 0 self.torrents_info['seeders'] = 0
def __get_grabs(self, torrent): def __get_grabs(self, torrent):
# torrent grabs # torrent grabs int
if 'grabs' not in self.fields: if 'grabs' not in self.fields:
return return
selector = self.fields.get('grabs', {}) selector = self.fields.get('grabs', {})
@ -474,7 +474,7 @@ class TorrentSpider:
self.torrents_info['grabs'] = 0 self.torrents_info['grabs'] = 0
def __get_pubdate(self, torrent): def __get_pubdate(self, torrent):
# torrent pubdate # torrent pubdate yyyy-mm-dd hh:mm:ss
if 'date_added' not in self.fields: if 'date_added' not in self.fields:
return return
selector = self.fields.get('date_added', {}) selector = self.fields.get('date_added', {})
@ -486,7 +486,7 @@ class TorrentSpider:
selector.get('filters')) selector.get('filters'))
def __get_date_elapsed(self, torrent): def __get_date_elapsed(self, torrent):
# torrent pubdate # torrent date elapsed text
if 'date_elapsed' not in self.fields: if 'date_elapsed' not in self.fields:
return return
selector = self.fields.get('date_elapsed', {}) selector = self.fields.get('date_elapsed', {})
@ -498,7 +498,7 @@ class TorrentSpider:
selector.get('filters')) selector.get('filters'))
def __get_downloadvolumefactor(self, torrent): def __get_downloadvolumefactor(self, torrent):
# downloadvolumefactor # downloadvolumefactor int
selector = self.fields.get('downloadvolumefactor', {}) selector = self.fields.get('downloadvolumefactor', {})
if not selector: if not selector:
return return
@ -521,7 +521,7 @@ class TorrentSpider:
self.torrents_info['downloadvolumefactor'] = int(downloadvolumefactor.group(1)) self.torrents_info['downloadvolumefactor'] = int(downloadvolumefactor.group(1))
def __get_uploadvolumefactor(self, torrent): def __get_uploadvolumefactor(self, torrent):
# uploadvolumefactor # uploadvolumefactor int
selector = self.fields.get('uploadvolumefactor', {}) selector = self.fields.get('uploadvolumefactor', {})
if not selector: if not selector:
return return
@ -544,7 +544,7 @@ class TorrentSpider:
self.torrents_info['uploadvolumefactor'] = int(uploadvolumefactor.group(1)) self.torrents_info['uploadvolumefactor'] = int(uploadvolumefactor.group(1))
def __get_labels(self, torrent): def __get_labels(self, torrent):
# labels # labels ['label1', 'label2']
if 'labels' not in self.fields: if 'labels' not in self.fields:
return return
selector = self.fields.get('labels', {}) selector = self.fields.get('labels', {})
@ -557,7 +557,7 @@ class TorrentSpider:
self.torrents_info['labels'] = [] self.torrents_info['labels'] = []
def __get_free_date(self, torrent): def __get_free_date(self, torrent):
# free date # free date yyyy-mm-dd hh:mm:ss
if 'freedate' not in self.fields: if 'freedate' not in self.fields:
return return
selector = self.fields.get('freedate', {}) selector = self.fields.get('freedate', {})
@ -569,7 +569,7 @@ class TorrentSpider:
selector.get('filters')) selector.get('filters'))
def __get_hit_and_run(self, torrent): def __get_hit_and_run(self, torrent):
# hitandrun # hitandrun True/False
if 'hr' not in self.fields: if 'hr' not in self.fields:
return return
selector = self.fields.get('hr', {}) selector = self.fields.get('hr', {})
@ -579,28 +579,71 @@ class TorrentSpider:
else: else:
self.torrents_info['hit_and_run'] = False self.torrents_info['hit_and_run'] = False
def __get_category(self, torrent):
    """
    Classify one torrent row by the category the site assigned to it.

    Reads the 'category' field rule from ``self.fields``, extracts the raw
    category value from ``torrent`` with the same selector / remove /
    attribute-or-text / index / filters pipeline the other field getters
    use, and stores the outcome in ``self.torrents_info['category']`` as one
    of ``MediaType.TV.value``, ``MediaType.MOVIE.value`` or
    ``MediaType.UNKNOWN.value``.

    Nothing is written when the indexer defines no 'category' field.

    :param torrent: the parsed torrent row; it is called with a selector
                    string and the result must support ``clone()``
    :return: None, the result is written to ``self.torrents_info``
    """
    if 'category' not in self.fields:
        return
    rule = self.fields.get('category', {})
    node = torrent(rule.get('selector', '')).clone()
    self.__remove(node, rule)
    raw_value = self.__index(self.__attribute_or_text(node, rule), rule)
    site_cat = self.__filter_text(raw_value, rule.get('filters'))

    # Anything that cannot be positively classified stays "unknown".
    resolved = MediaType.UNKNOWN.value
    if site_cat and self.category:
        # self.category is read as a mapping with optional "tv" / "movie"
        # entries, each an iterable of mappings carrying an "id".
        tv_ids = [str(item.get("id")) for item in self.category.get("tv") or []]
        movie_ids = [str(item.get("id")) for item in self.category.get("movie") or []]
        if site_cat in movie_ids:
            # An id listed under both groups is treated as a movie.
            resolved = MediaType.MOVIE.value
        elif site_cat in tv_ids:
            resolved = MediaType.TV.value
    self.torrents_info['category'] = resolved
def get_info(self, torrent) -> dict: def get_info(self, torrent) -> dict:
""" """
解析单条种子数据 解析单条种子数据
""" """
self.torrents_info = {} self.torrents_info = {}
try: try:
# 标题
self.__get_title(torrent) self.__get_title(torrent)
# 描述
self.__get_description(torrent) self.__get_description(torrent)
# 详情页面
self.__get_detail(torrent) self.__get_detail(torrent)
# 下载链接
self.__get_download(torrent) self.__get_download(torrent)
# 完成数
self.__get_grabs(torrent) self.__get_grabs(torrent)
# 下载数
self.__get_leechers(torrent) self.__get_leechers(torrent)
# 做种数
self.__get_seeders(torrent) self.__get_seeders(torrent)
# 大小
self.__get_size(torrent) self.__get_size(torrent)
# IMDBID
self.__get_imdbid(torrent) self.__get_imdbid(torrent)
# 下载系数
self.__get_downloadvolumefactor(torrent) self.__get_downloadvolumefactor(torrent)
# 上传系数
self.__get_uploadvolumefactor(torrent) self.__get_uploadvolumefactor(torrent)
# 发布时间
self.__get_pubdate(torrent) self.__get_pubdate(torrent)
# 已发布时间
self.__get_date_elapsed(torrent) self.__get_date_elapsed(torrent)
# 免费截止时间
self.__get_free_date(torrent) self.__get_free_date(torrent)
# 标签
self.__get_labels(torrent) self.__get_labels(torrent)
# HR
self.__get_hit_and_run(torrent) self.__get_hit_and_run(torrent)
# 分类
self.__get_category(torrent)
except Exception as err: except Exception as err:
logger.error("%s 搜索出现错误:%s" % (self.indexername, str(err))) logger.error("%s 搜索出现错误:%s" % (self.indexername, str(err)))
return self.torrents_info return self.torrents_info
@ -632,6 +675,11 @@ class TorrentSpider:
text = text.strip() text = text.strip()
elif method_name == "appendleft": elif method_name == "appendleft":
text = f"{args}{text}" text = f"{args}{text}"
elif method_name == "querystring":
parsed_url = urlparse(text)
query_params = parse_qs(parsed_url.query)
param_value = query_params.get(args)
text = param_value[0] if param_value else ''
except Exception as err: except Exception as err:
logger.debug(f'过滤器 {method_name} 处理失败:{str(err)} - {traceback.format_exc()}') logger.debug(f'过滤器 {method_name} 处理失败:{str(err)} - {traceback.format_exc()}')
return text.strip() return text.strip()