This commit is contained in:
jxxghp
2023-06-06 07:15:17 +08:00
commit 4d06f86e62
217 changed files with 13959 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
from .metabase import MetaBase
from .metavideo import MetaVideo
from .metaanime import MetaAnime

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

218
app/core/meta/metaanime.py Normal file
View File

@@ -0,0 +1,218 @@
import re
import zhconv
import anitopy
from app.core.meta.metabase import MetaBase
from app.core.meta.release_groups import ReleaseGroupsMatcher
from app.utils.string import StringUtils
from app.utils.types import MediaType
class MetaAnime(MetaBase):
"""
Recognize anime metadata from a release title.
"""
# Candidate names equal to one of these entries are treated as "no usable name" while parsing the title
_anime_no_words = ['CHS&CHT', 'MP4', 'GB MP4', 'WEB-DL']
# Season / episode markers (S01, S01-S02, EP01, E01-E12, ...) that are removed from the recognized names
_name_nostring_re = r"S\d{2}\s*-\s*S\d{2}|S\d{2}|\s+S\d{1,2}|EP?\d{2,4}\s*-\s*EP?\d{2,4}|EP?\d{2,4}|\s+EP?\d{1,4}"
def __init__(self, title: str, subtitle: str = None, isfile: bool = False):
# Builds the anime metadata from a release title: name, year, season, episode, media type,
# resolution, release group and codecs are read from the anitopy parse result, then
# init_subtitle() is consulted for additional season/episode hints.
super().__init__(title, subtitle, isfile)
if not title:
return
# Parse the anime title with the third-party anitopy module
try:
original_title = title
# Preprocessing strips the release-group information, so the raw title is parsed first
anitopy_info_origin = anitopy.parse(title)
title = self.__prepare_title(title)
anitopy_info = anitopy.parse(title)
if anitopy_info:
# Name
name = anitopy_info.get("anime_title")
if name and name.find("/") != -1:
name = name.split("/")[-1].strip()
# Missing, black-listed or short (< 5 chars) name that StringUtils.is_chinese rejects:
# parse again with a dummy "[ANIME]" group prepended
if not name or name in self._anime_no_words or (len(name) < 5 and not StringUtils.is_chinese(name)):
anitopy_info = anitopy.parse("[ANIME]" + title)
if anitopy_info:
name = anitopy_info.get("anime_title")
# Still nothing usable: fall back to the content of the first [...] group
if not name or name in self._anime_no_words or (len(name) < 5 and not StringUtils.is_chinese(name)):
name_match = re.search(r'\[(.+?)]', title)
if name_match and name_match.group(1):
name = name_match.group(1).strip()
# Split the name into its Chinese and English parts
if name:
lastword_type = ""
for word in name.split():
if not word:
continue
if word.endswith(']'):
word = word[:-1]
# A pure number is attached to whichever language the previous word belonged to
if word.isdigit():
if lastword_type == "cn":
self.cn_name = "%s %s" % (self.cn_name or "", word)
elif lastword_type == "en":
self.en_name = "%s %s" % (self.en_name or "", word)
elif StringUtils.is_chinese(word):
self.cn_name = "%s %s" % (self.cn_name or "", word)
lastword_type = "cn"
else:
self.en_name = "%s %s" % (self.en_name or "", word)
lastword_type = "en"
if self.cn_name:
_, self.cn_name, _, _, _, _ = StringUtils.get_keyword(self.cn_name)
if self.cn_name:
# Drop season/episode markers, then convert to Simplified Chinese
self.cn_name = re.sub(r'%s' % self._name_nostring_re, '', self.cn_name, flags=re.IGNORECASE).strip()
self.cn_name = zhconv.convert(self.cn_name, "zh-hans")
if self.en_name:
self.en_name = re.sub(r'%s' % self._name_nostring_re, '', self.en_name, flags=re.IGNORECASE).strip().title()
self._name = StringUtils.str_title(self.en_name)
# Year
year = anitopy_info.get("anime_year")
if str(year).isdigit():
self.year = str(year)
# Season number (anitopy may return a single value or a list)
anime_season = anitopy_info.get("anime_season")
if isinstance(anime_season, list):
if len(anime_season) == 1:
begin_season = anime_season[0]
end_season = None
else:
begin_season = anime_season[0]
end_season = anime_season[-1]
elif anime_season:
begin_season = anime_season
end_season = None
else:
begin_season = None
end_season = None
if begin_season:
self.begin_season = int(begin_season)
if end_season and int(end_season) != self.begin_season:
self.end_season = int(end_season)
self.total_seasons = (self.end_season - self.begin_season) + 1
else:
self.total_seasons = 1
self.type = MediaType.TV
# Episode number (anitopy may return a single value or a list)
episode_number = anitopy_info.get("episode_number")
if isinstance(episode_number, list):
if len(episode_number) == 1:
begin_episode = episode_number[0]
end_episode = None
else:
begin_episode = episode_number[0]
end_episode = episode_number[-1]
elif episode_number:
begin_episode = episode_number
end_episode = None
else:
begin_episode = None
end_episode = None
if begin_episode:
try:
self.begin_episode = int(begin_episode)
if end_episode and int(end_episode) != self.begin_episode:
self.end_episode = int(end_episode)
self.total_episodes = (self.end_episode - self.begin_episode) + 1
else:
self.total_episodes = 1
except Exception as err:
# Non-numeric episode value: report it and clear the episode range
print(str(err))
self.begin_episode = None
self.end_episode = None
self.type = MediaType.TV
# Media type
if not self.type:
anime_type = anitopy_info.get('anime_type')
if isinstance(anime_type, list):
anime_type = anime_type[0]
if anime_type and anime_type.upper() == "TV":
self.type = MediaType.TV
else:
self.type = MediaType.MOVIE
# Resolution
self.resource_pix = anitopy_info.get("video_resolution")
if isinstance(self.resource_pix, list):
self.resource_pix = self.resource_pix[0]
if self.resource_pix:
# "WxH" is reduced to "<H>p"; anything else is lower-cased
if re.search(r'x', self.resource_pix, re.IGNORECASE):
self.resource_pix = re.split(r'[Xx]', self.resource_pix)[-1] + "p"
else:
self.resource_pix = self.resource_pix.lower()
if str(self.resource_pix).isdigit():
self.resource_pix = str(self.resource_pix) + "p"
# Release group / fansub group
self.resource_team = \
ReleaseGroupsMatcher().match(title=original_title) or \
anitopy_info_origin.get("release_group") or None
# Video codec
self.video_encode = anitopy_info.get("video_term")
if isinstance(self.video_encode, list):
self.video_encode = self.video_encode[0]
# Audio codec
self.audio_encode = anitopy_info.get("audio_term")
if isinstance(self.audio_encode, list):
self.audio_encode = self.audio_encode[0]
# Parse the subtitle; only season and episode information is taken from it
self.init_subtitle(self.org_string)
if not self._subtitle_flag and self.subtitle:
self.init_subtitle(self.subtitle)
if not self.type:
self.type = MediaType.TV
except Exception as e:
# Best effort: any parsing failure is printed and the fields recognized so far are kept
print(str(e))
@staticmethod
def __prepare_title(title: str):
    """
    Pre-process a raw anime release title before it is parsed.

    Full-width brackets are converted to ``[]``; "new season" / category tags,
    file sizes and ``[TV xx`` prefixes are stripped; ``[4K]`` becomes ``2160p``;
    for ``a/b`` style segments only one side is kept.

    :param title: raw title string
    :return: the normalized title (a falsy input is returned unchanged)
    """
    if not title:
        return title
    # Replace every full-width bracket 【】 with []. The bracket literals must not be
    # empty strings: str.replace("", x) would insert x between every character.
    title = title.replace("【", "[").replace("】", "]").strip()
    # Cut off "xx番 / 剧 / 漫" style tags
    match = re.search(r"新番|月?番|[日美国][漫剧]", title)
    if match and match.span()[1] < len(title) - 1:
        # Tag is not at the very end: drop everything up to and including it
        title = re.sub(".*番.|.*[日美国][漫剧].", "", title)
    elif match:
        # Tag sits at the end: drop the last bracket group
        title = title[:title.rfind('[')]
    # Cut off a leading category segment
    first_item = title.split(']')[0]
    if first_item and re.search(r"[动漫画纪录片电影视连续剧集日美韩中港台海外亚洲华语大陆综艺原盘高清]{2,}|TV|Animation|Movie|Documentar|Anime",
                                zhconv.convert(first_item, "zh-hans"),
                                re.IGNORECASE):
        title = re.sub(r"^[^]]*]", "", title).strip()
    # Remove file sizes
    title = re.sub(r'[0-9.]+\s*[MGT]i?B(?![A-Z]+)', "", title, flags=re.IGNORECASE)
    # Turn "[TV xx" into "[xx"
    title = re.sub(r"\[TV\s+(\d{1,4})", r"[\1", title, flags=re.IGNORECASE)
    # Turn [4K] into 2160p
    title = re.sub(r'\[4k]', '2160p', title, flags=re.IGNORECASE)
    # Handle Chinese/English titles separated by "/"
    names = title.split("]")
    if len(names) > 1 and "- " not in title:
        titles = []
        for name in names:
            if not name:
                continue
            left_char = ''
            if name.startswith('['):
                left_char = '['
                name = name[1:]
            if not name:
                continue
            if "/" in name:
                # Prefer the part after the last "/", fall back to the first part
                pieces = name.split("/")
                kept = pieces[-1].strip() or pieces[0].strip()
                titles.append("%s%s" % (left_char, kept))
            else:
                if StringUtils.is_chinese(name) and not StringUtils.is_all_chinese(name):
                    # Mixed segment: strip digits, punctuation and CJK characters
                    if not re.search(r"\[\d+", name, re.IGNORECASE):
                        name = re.sub(r'[\d|#:\-()\u4e00-\u9fff]', '', name).strip()
                    if not name or name.strip().isdigit():
                        continue
                if name == '[':
                    titles.append("")
                else:
                    titles.append("%s%s" % (left_char, name.strip()))
        return "]".join(titles)
    return title

427
app/core/meta/metabase.py Normal file
View File

@@ -0,0 +1,427 @@
from typing import Union, Optional
import cn2an
import regex as re
from app.utils.string import StringUtils
from app.utils.types import MediaType
class MetaBase(object):
"""
Base class for recognized media metadata.
"""
# Whether the string being parsed is a file name
isfile: bool = False
# Original string
org_string: Optional[str] = None
# Subtitle
subtitle: Optional[str] = None
# Media type: movie or TV series
type: Optional[MediaType] = None
# Recognized Chinese name
cn_name: Optional[str] = None
# Recognized English name
en_name: Optional[str] = None
# Year
year: Optional[str] = None
# Total number of seasons
total_seasons: int = 0
# Recognized first season (number)
begin_season: Optional[int] = None
# Recognized last season (number)
end_season: Optional[int] = None
# Total number of episodes
total_episodes: int = 0
# Recognized first episode
begin_episode: Optional[int] = None
# Recognized last episode
end_episode: Optional[int] = None
# Partx Cd Dvd Disk Disc
part: Optional[str] = None
# Recognized resource type
resource_type: Optional[str] = None
# Recognized effect
resource_effect: Optional[str] = None
# Recognized resolution
resource_pix: Optional[str] = None
# Recognized release group / fansub group
resource_team: Optional[str] = None
# Video codec
video_encode: Optional[str] = None
# Audio codec
audio_encode: Optional[str] = None
# Subtitle parsing: flag set once init_subtitle() recognized something, and the patterns it uses
_subtitle_flag = False
_subtitle_season_re = r"(?<![全共]\s*)[第\s]+([0-9一二三四五六七八九十S\-]+)\s*季(?!\s*[全共])"
_subtitle_season_all_re = r"[全共]\s*([0-9一二三四五六七八九十]+)\s*季|([0-9一二三四五六七八九十]+)\s*季\s*全"
_subtitle_episode_re = r"(?<![全共]\s*)[第\s]+([0-9一二三四五六七八九十百零EP\-]+)\s*[集话話期](?!\s*[全共])"
_subtitle_episode_all_re = r"([0-9一二三四五六七八九十百零]+)\s*集\s*全|[全共]\s*([0-9一二三四五六七八九十百零]+)\s*[集话話期]"
def __init__(self, title: str, subtitle: str = None, isfile: bool = False):
    """
    Remember the raw inputs of the recognition.

    Nothing is stored when ``title`` is empty; the class-level defaults stay in effect.

    :param title: original title string
    :param subtitle: optional subtitle / description
    :param isfile: whether ``title`` is a file name
    """
    if title:
        self.org_string = title
        self.subtitle = subtitle
        self.isfile = isfile
def get_name(self):
    """
    Return the display name.

    The Chinese name wins when StringUtils.is_all_chinese accepts it; otherwise the
    English name is used, then the Chinese name, then an empty string.
    """
    chinese, english = self.cn_name, self.en_name
    if chinese and StringUtils.is_all_chinese(chinese):
        return chinese
    return english or chinese or ""
def init_subtitle(self, title_text: str):
"""
Recognize season / episode information in a subtitle-like text.

Fields already recognized are not overwritten (see the ``is None`` guards below).
``_subtitle_flag`` is set once anything was recognized here.
"""
if not title_text:
return
# Pad with spaces so the patterns that start with whitespace can also match at the text start
title_text = f" {title_text} "
if re.search(r'[全第季集话話期]', title_text, re.IGNORECASE):
# Season x ("第x季")
season_str = re.search(r'%s' % self._subtitle_season_re, title_text, re.IGNORECASE)
if season_str:
seasons = season_str.group(1)
if seasons:
seasons = seasons.upper().replace("S", "").strip()
else:
return
try:
end_season = None
if seasons.find('-') != -1:
seasons = seasons.split('-')
# cn2an converts Chinese numerals as well as plain digits
begin_season = int(cn2an.cn2an(seasons[0].strip(), mode='smart'))
if len(seasons) > 1:
end_season = int(cn2an.cn2an(seasons[1].strip(), mode='smart'))
else:
begin_season = int(cn2an.cn2an(seasons, mode='smart'))
except Exception as err:
# Unparsable number: report it and stop the subtitle recognition
print(str(err))
return
if self.begin_season is None and isinstance(begin_season, int):
self.begin_season = begin_season
self.total_seasons = 1
if self.begin_season is not None \
and self.end_season is None \
and isinstance(end_season, int) \
and end_season != self.begin_season:
self.end_season = end_season
self.total_seasons = (self.end_season - self.begin_season) + 1
self.type = MediaType.TV
self._subtitle_flag = True
# Episode x ("第x集")
episode_str = re.search(r'%s' % self._subtitle_episode_re, title_text, re.IGNORECASE)
if episode_str:
episodes = episode_str.group(1)
if episodes:
episodes = episodes.upper().replace("E", "").replace("P", "").strip()
else:
return
try:
end_episode = None
if episodes.find('-') != -1:
episodes = episodes.split('-')
begin_episode = int(cn2an.cn2an(episodes[0].strip(), mode='smart'))
if len(episodes) > 1:
end_episode = int(cn2an.cn2an(episodes[1].strip(), mode='smart'))
else:
begin_episode = int(cn2an.cn2an(episodes, mode='smart'))
except Exception as err:
# Unparsable number: report it and stop the subtitle recognition
print(str(err))
return
if self.begin_episode is None and isinstance(begin_episode, int):
self.begin_episode = begin_episode
self.total_episodes = 1
if self.begin_episode is not None \
and self.end_episode is None \
and isinstance(end_episode, int) \
and end_episode != self.begin_episode:
self.end_episode = end_episode
self.total_episodes = (self.end_episode - self.begin_episode) + 1
self.type = MediaType.TV
self._subtitle_flag = True
# All x episodes ("x集全" / "全x集")
episode_all_str = re.search(r'%s' % self._subtitle_episode_all_re, title_text, re.IGNORECASE)
if episode_all_str:
episode_all = episode_all_str.group(1)
if not episode_all:
episode_all = episode_all_str.group(2)
if episode_all and self.begin_episode is None:
try:
self.total_episodes = int(cn2an.cn2an(episode_all.strip(), mode='smart'))
except Exception as err:
print(str(err))
return
self.begin_episode = None
self.end_episode = None
self.type = MediaType.TV
self._subtitle_flag = True
# All x seasons ("全x季" / "x季全")
season_all_str = re.search(r"%s" % self._subtitle_season_all_re, title_text, re.IGNORECASE)
if season_all_str:
season_all = season_all_str.group(1)
if not season_all:
season_all = season_all_str.group(2)
if season_all and self.begin_season is None and self.begin_episode is None:
try:
self.total_seasons = int(cn2an.cn2an(season_all.strip(), mode='smart'))
except Exception as err:
print(str(err))
return
# A complete pack is recorded as seasons 1..total
self.begin_season = 1
self.end_season = self.total_seasons
self.type = MediaType.TV
self._subtitle_flag = True
def is_in_season(self, season: Union[list, int, str]):
"""
是否包含季
"""
if isinstance(season, list):
if self.end_season is not None:
meta_season = list(range(self.begin_season, self.end_season + 1))
else:
if self.begin_season is not None:
meta_season = [self.begin_season]
else:
meta_season = [1]
return set(meta_season).issuperset(set(season))
else:
if self.end_season is not None:
return self.begin_season <= int(season) <= self.end_season
else:
if self.begin_season is not None:
return int(season) == self.begin_season
else:
return int(season) == 1
def is_in_episode(self, episode: Union[list, int, str]):
"""
是否包含集
"""
if isinstance(episode, list):
if self.end_episode is not None:
meta_episode = list(range(self.begin_episode, self.end_episode + 1))
else:
meta_episode = [self.begin_episode]
return set(meta_episode).issuperset(set(episode))
else:
if self.end_episode is not None:
return self.begin_episode <= int(episode) <= self.end_episode
else:
return int(episode) == self.begin_episode
def get_season_string(self):
    """
    Return the season as text: ``Sxx`` or ``Sxx-Syy`` for a range.

    Without a recognized season the result is ``""`` for movies and ``"S01"`` otherwise.
    """
    if self.begin_season is None:
        return "" if self.type == MediaType.MOVIE else "S01"
    labels = ["S" + str(self.begin_season).rjust(2, "0")]
    if self.end_season is not None:
        labels.append("S" + str(self.end_season).rjust(2, "0"))
    return "-".join(labels)
def get_season_item(self):
    """
    Return ``Sxx`` for the first recognized season.

    Without a recognized season the result is ``""`` for movies and ``"S01"`` otherwise.
    """
    season = self.begin_season
    if season is None:
        return "" if self.type == MediaType.MOVIE else "S01"
    return "S" + str(season).rjust(2, "0")
def get_season_seq(self):
    """
    Return the first recognized season as a plain number string.

    Without a recognized season the result is ``""`` for movies and ``"1"`` otherwise.
    """
    season = self.begin_season
    if season is None:
        return "" if self.type == MediaType.MOVIE else "1"
    return str(season)
def get_season_list(self):
    """
    Return the recognized seasons as a list of numbers.

    Without a recognized season the result is ``[]`` for movies and ``[1]`` otherwise.
    """
    first, last = self.begin_season, self.end_season
    if first is None:
        return [] if self.type == MediaType.MOVIE else [1]
    if last is None:
        return [first]
    return list(range(first, last + 1))
def set_season(self, sea: Union[list, int, str]):
"""
更新季
"""
if not sea:
return
if isinstance(sea, list):
if len(sea) == 1 and str(sea[0]).isdigit():
self.begin_season = int(sea[0])
self.end_season = None
elif len(sea) > 1 and str(sea[0]).isdigit() and str(sea[-1]).isdigit():
self.begin_season = int(sea[0])
self.end_season = int(sea[-1])
elif str(sea).isdigit():
self.begin_season = int(sea)
self.end_season = None
def set_episode(self, ep: Union[list, int, str]):
"""
更新集
"""
if not ep:
return
if isinstance(ep, list):
if len(ep) == 1 and str(ep[0]).isdigit():
self.begin_episode = int(ep[0])
self.end_episode = None
elif len(ep) > 1 and str(ep[0]).isdigit() and str(ep[-1]).isdigit():
self.begin_episode = int(ep[0])
self.end_episode = int(ep[-1])
elif str(ep).isdigit():
self.begin_episode = int(ep)
self.end_episode = None
#
def get_episode_string(self):
    """
    Return the episode as text: ``Exx`` or ``Exx-Eyy`` for a range, ``""`` when unknown.
    """
    if self.begin_episode is None:
        return ""
    labels = ["E" + str(self.begin_episode).rjust(2, "0")]
    if self.end_episode is not None:
        labels.append("E" + str(self.end_episode).rjust(2, "0"))
    return "-".join(labels)
def get_episode_list(self):
    """
    Return the recognized episodes as a list of numbers (empty when unknown).
    """
    first, last = self.begin_episode, self.end_episode
    if first is None:
        return []
    if last is None:
        return [first]
    return list(range(first, last + 1))
def get_episode_items(self):
    """
    Return the episodes in side-by-side notation (``E01E02``), used for single
    files that contain several episodes.

    :return: the concatenated ``Exx`` markers, or ``""`` when no episode is known
             (previously a bare ``"E"`` was returned in that case)
    """
    episodes = self.get_episode_list()
    if not episodes:
        return ""
    return "".join("E%s" % str(episode).rjust(2, '0') for episode in episodes)
def get_episode_seqs(self):
    """
    Return the episode numbers of a multi-episode file as ``first-last``;
    a single episode gives just its number and no episode gives ``""``.
    """
    episodes = self.get_episode_list()
    if not episodes:
        return ""
    if len(episodes) == 1:
        return str(episodes[0])
    return f"{episodes[0]}-{episodes[-1]}"
def get_episode_seq(self):
    """
    Return the first recognized episode as a plain number string, or ``""``.
    """
    episodes = self.get_episode_list()
    return str(episodes[0]) if episodes else ""
def get_season_episode_string(self):
    """
    Return the combined season/episode text (``Sxx Exx``).

    Movies always give ``""``; otherwise the non-empty results of
    get_season_string() and get_episode_string() are joined with a space.
    """
    if self.type == MediaType.MOVIE:
        return ""
    season_text = self.get_season_string()
    episode_text = self.get_episode_string()
    return " ".join(text for text in (season_text, episode_text) if text)
def get_resource_type_string(self):
    """
    Return resource type, effect and resolution as one string.

    Every present part is prefixed with a space, so a non-empty result starts with a space.
    """
    parts = (self.resource_type, self.resource_effect, self.resource_pix)
    return "".join(f" {part}" for part in parts if part)
def get_edtion_string(self):
    """
    Return resource type and effect as one string, without the resolution.

    The present parts are separated by a space and the result is stripped.
    """
    pieces = []
    for value in (self.resource_type, self.resource_effect):
        if value:
            pieces.append(f" {value}")
    return "".join(pieces).strip()
def get_resource_team_string(self):
    """
    Return the release group / fansub group, or ``""`` when none was recognized.
    """
    return self.resource_team or ""
def get_video_encode_string(self):
    """
    Return the video codec, or ``""`` when none was recognized.
    """
    codec = self.video_encode
    if codec:
        return codec
    return ""
def get_audio_encode_string(self):
    """
    Return the audio codec, or ``""`` when none was recognized.
    """
    codec = self.audio_encode
    if codec:
        return codec
    return ""

557
app/core/meta/metavideo.py Normal file
View File

@@ -0,0 +1,557 @@
import re
from pathlib import Path
from app.core.config import settings
from app.core.meta.metabase import MetaBase
from app.core.meta.release_groups import ReleaseGroupsMatcher
from app.utils.string import StringUtils
from app.utils.tokens import Tokens
from app.utils.types import MediaType
class MetaVideo(MetaBase):
    """
    Recognize movie and TV series metadata from a release title.
    """
    # Control flags / parser state (class-level defaults)
    _stop_name_flag = False
    _stop_cnname_flag = False
    _last_token = ""
    _last_token_type = ""
    _continue_flag = True
    _unknown_name_str = ""
    _source = ""
    _effect = []
    # Regular expressions
    _season_re = r"S(\d{2})|^S(\d{1,2})$|S(\d{1,2})E"
    _episode_re = r"EP?(\d{2,4})$|^EP?(\d{1,4})$|^S\d{1,2}EP?(\d{1,4})$|S\d{2}EP?(\d{2,4})"
    _part_re = r"(^PART[0-9ABI]{0,2}$|^CD[0-9]{0,2}$|^DVD[0-9]{0,2}$|^DISK[0-9]{0,2}$|^DISC[0-9]{0,2}$)"
    _roman_numerals = r"^(?=[MDCLXVI])M*(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})$"
    _source_re = r"^BLURAY$|^HDTV$|^UHDTV$|^HDDVD$|^WEBRIP$|^DVDRIP$|^BDRIP$|^BLU$|^WEB$|^BD$|^HDRip$"
    _effect_re = r"^REMUX$|^UHD$|^SDR$|^HDR\d*$|^DOLBY$|^DOVI$|^DV$|^3D$|^REPACK$"
    _resources_type_re = r"%s|%s" % (_source_re, _effect_re)
    _name_no_begin_re = r"^\[.+?]"
    _name_no_chinese_re = r".*版|.*字幕"
    # Season / episode marker words. A token equal to one of them marks the following
    # number as a season/episode count rather than part of the title. The entries must be
    # the marker characters themselves: empty strings never equal a token.
    # NOTE(review): restored from the marker characters used by the season/episode
    # patterns in this file - confirm against the repository.
    _name_se_words = ['共', '第', '季', '集', '话', '話', '期']
    # Noise removed from recognized names
    _name_nostring_re = r"^PTS|^JADE|^AOD|^CHC|^[A-Z]{1,4}TV[\-0-9UVHDK]*" \
                        r"|HBO$|\s+HBO|\d{1,2}th|\d{1,2}bit|NETFLIX|AMAZON|IMAX|^3D|\s+3D|^BBC\s+|\s+BBC|BBC$|DISNEY\+?|XXX|\s+DC$" \
                        r"|[第\s共]+[0-9一二三四五六七八九十\-\s]+季" \
                        r"|[第\s共]+[0-9一二三四五六七八九十百零\-\s]+[集话話]" \
                        r"|连载|日剧|美剧|电视剧|动画片|动漫|欧美|西德|日韩|超高清|高清|蓝光|翡翠台|梦幻天堂·龙网|★?\d*月?新番" \
                        r"|最终季|合集|[多中国英葡法俄日韩德意西印泰台港粤双文语简繁体特效内封官译外挂]+字幕|版本|出品|台版|港版|\w+字幕组" \
                        r"|未删减版|UNCUT$|UNRATE$|WITH EXTRAS$|RERIP$|SUBBED$|PROPER$|REPACK$|SEASON$|EPISODE$|Complete$|Extended$|Extended Version$" \
                        r"|S\d{2}\s*-\s*S\d{2}|S\d{2}|\s+S\d{1,2}|EP?\d{2,4}\s*-\s*EP?\d{2,4}|EP?\d{2,4}|\s+EP?\d{1,4}" \
                        r"|CD[\s.]*[1-9]|DVD[\s.]*[1-9]|DISK[\s.]*[1-9]|DISC[\s.]*[1-9]" \
                        r"|[248]K|\d{3,4}[PIX]+" \
                        r"|CD[\s.]*[1-9]|DVD[\s.]*[1-9]|DISK[\s.]*[1-9]|DISC[\s.]*[1-9]"
    _resources_pix_re = r"^[SBUHD]*(\d{3,4}[PI]+)|\d{3,4}X(\d{3,4})"
    _resources_pix_re2 = r"(^[248]+K)"
    _video_encode_re = r"^[HX]26[45]$|^AVC$|^HEVC$|^VC\d?$|^MPEG\d?$|^Xvid$|^DivX$|^HDR\d*$"
    _audio_encode_re = r"^DTS\d?$|^DTSHD$|^DTSHDMA$|^Atmos$|^TrueHD\d?$|^AC3$|^\dAudios?$|^DDP\d?$|^DD\d?$|^LPCM\d?$|^AAC\d?$|^FLAC\d?$|^HD\d?$|^MA\d?$"
def __init__(self, title: str, subtitle: str = None, isfile: bool = False):
# Recognizes a movie / TV release title: the title is cleaned, split into tokens, and every
# token is offered to the __init_* handlers in a fixed order; a handler that consumes the
# token clears _continue_flag so the remaining handlers are skipped for that token.
super().__init__(title, subtitle, isfile)
if not title:
return
original_title = title
# Per-instance state (shadows the class-level defaults)
self._source = ""
self._effect = []
# Check whether the file is named with digits only
title_path = Path(title)
if title_path.suffix.lower() in settings.RMT_MEDIAEXT \
and title_path.stem.isdigit() \
and len(title_path.stem) < 5:
self.begin_episode = int(title_path.stem)
self.type = MediaType.TV
return
# Remove the content of the first [] in the name
title = re.sub(r'%s' % self._name_no_begin_re, "", title, count=1)
# Replace a xxxx-xxxx year range with its first year; such ranges often appear on season packs
title = re.sub(r'([\s.]+)(\d{4})-(\d{4})', r'\1\2', title)
# Remove file sizes
title = re.sub(r'[0-9.]+\s*[MGT]i?B(?![A-Z]+)', "", title, flags=re.IGNORECASE)
# Remove year-month-day dates
title = re.sub(r'\d{4}[\s._-]\d{1,2}[\s._-]\d{1,2}', "", title)
# Split into tokens
tokens = Tokens(title)
self.tokens = tokens
# Parse name, year, season, episode, resource type, resolution, etc.
token = tokens.get_next()
while token:
# Part
self.__init_part(token)
# Title
if self._continue_flag:
self.__init_name(token)
# Year
if self._continue_flag:
self.__init_year(token)
# Resolution
if self._continue_flag:
self.__init_resource_pix(token)
# Season
if self._continue_flag:
self.__init_season(token)
# Episode
if self._continue_flag:
self.__init_episode(token)
# Resource type
if self._continue_flag:
self.__init_resource_type(token)
# Video codec
if self._continue_flag:
self.__init_video_encode(token)
# Audio codec
if self._continue_flag:
self.__init_audio_encode(token)
# Fetch the next token until none are left
token = tokens.get_next()
self._continue_flag = True
# Assemble the effect / quality string
if self._effect:
self._effect.reverse()
self.resource_effect = " ".join(self._effect)
if self._source:
self.resource_type = self._source.strip()
# Detect DIY BluRay discs
if self.resource_type and "BluRay" in self.resource_type:
if (self.subtitle and re.findall(r'D[Ii]Y', self.subtitle)) \
or re.findall(r'-D[Ii]Y@', original_title):
self.resource_type = f"{self.resource_type} DIY"
# Parse the subtitle; only season and episode information is taken from it
self.init_subtitle(self.org_string)
if not self._subtitle_flag and self.subtitle:
self.init_subtitle(self.subtitle)
# Default to movie when no type was recognized
if not self.type:
self.type = MediaType.MOVIE
# Strip noise from the names; overly short digit-only names are dropped
self.cn_name = self.__fix_name(self.cn_name)
self.en_name = StringUtils.str_title(self.__fix_name(self.en_name))
# A bare "PART" without a number is not a usable part marker
if self.part and self.part.upper() == "PART":
self.part = None
# Release group / fansub group
self.resource_team = ReleaseGroupsMatcher().match(title=original_title) or None
def __fix_name(self, name: str):
    """
    Remove noise words from a recognized name and collapse whitespace.

    A purely numeric result below 1800 is treated as an episode number rather than a
    title when nothing else (year, season, resolution, resource type, codecs) was
    recognized: it becomes ``begin_episode`` if that is still unset, and the name is dropped.

    :param name: recognized Chinese or English name (may be empty)
    :return: the cleaned name, or None when it turned out to be an episode number
    """
    if not name:
        return name
    cleaned = re.sub(r'%s' % self._name_nostring_re, '', name, flags=re.IGNORECASE).strip()
    cleaned = re.sub(r'\s+', ' ', cleaned)
    if not cleaned.isdigit() or int(cleaned) >= 1800:
        return cleaned
    has_context = self.year \
        or self.begin_season \
        or self.resource_pix \
        or self.resource_type \
        or self.audio_encode \
        or self.video_encode
    if has_context:
        return cleaned
    if self.begin_episode is None:
        self.begin_episode = int(cleaned)
        return None
    if self.is_in_episode(int(cleaned)) and not self.begin_season:
        return None
    return cleaned
def __init_name(self, token: str):
if not token:
return
# 回收标题
if self._unknown_name_str:
if not self.cn_name:
if not self.en_name:
self.en_name = self._unknown_name_str
elif self._unknown_name_str != self.year:
self.en_name = "%s %s" % (self.en_name, self._unknown_name_str)
self._last_token_type = "enname"
self._unknown_name_str = ""
if self._stop_name_flag:
return
if token.upper() == "AKA":
self._continue_flag = False
self._stop_name_flag = True
return
if token in self._name_se_words:
self._last_token_type = 'name_se_words'
return
if StringUtils.is_chinese(token):
# 含有中文,直接做为标题(连着的数字或者英文会保留),且不再取用后面出现的中文
self._last_token_type = "cnname"
if not self.cn_name:
self.cn_name = token
elif not self._stop_cnname_flag:
if not re.search("%s" % self._name_no_chinese_re, token, flags=re.IGNORECASE) \
and not re.search("%s" % self._name_se_words, token, flags=re.IGNORECASE):
self.cn_name = "%s %s" % (self.cn_name, token)
self._stop_cnname_flag = True
else:
is_roman_digit = re.search(self._roman_numerals, token)
# 阿拉伯数字或者罗马数字
if token.isdigit() or is_roman_digit:
# 第季集后面的不要
if self._last_token_type == 'name_se_words':
return
if self.get_name():
# 名字后面以 0 开头的不要,极有可能是集
if token.startswith('0'):
return
# 检查是否真正的数字
if token.isdigit():
try:
int(token)
except ValueError:
return
# 中文名后面跟的数字不是年份的极有可能是集
if not is_roman_digit \
and self._last_token_type == "cnname" \
and int(token) < 1900:
return
if (token.isdigit() and len(token) < 4) or is_roman_digit:
# 4位以下的数字或者罗马数字拼装到已有标题中
if self._last_token_type == "cnname":
self.cn_name = "%s %s" % (self.cn_name, token)
elif self._last_token_type == "enname":
self.en_name = "%s %s" % (self.en_name, token)
self._continue_flag = False
elif token.isdigit() and len(token) == 4:
# 4位数字可能是年份也可能真的是标题的一部分也有可能是集
if not self._unknown_name_str:
self._unknown_name_str = token
else:
# 名字未出现前的第一个数字,记下来
if not self._unknown_name_str:
self._unknown_name_str = token
elif re.search(r"%s" % self._season_re, token, re.IGNORECASE):
# 季的处理
if self.en_name and re.search(r"SEASON$", self.en_name, re.IGNORECASE):
# 如果匹配到季英文名结尾为Season说明Season属于标题不应在后续作为干扰词去除
self.en_name += ' '
self._stop_name_flag = True
return
elif re.search(r"%s" % self._episode_re, token, re.IGNORECASE) \
or re.search(r"(%s)" % self._resources_type_re, token, re.IGNORECASE) \
or re.search(r"%s" % self._resources_pix_re, token, re.IGNORECASE):
# 集、来源、版本等不要
self._stop_name_flag = True
return
else:
# 后缀名不要
if ".%s".lower() % token in settings.RMT_MEDIAEXT:
return
# 英文或者英文+数字,拼装起来
if self.en_name:
self.en_name = "%s %s" % (self.en_name, token)
else:
self.en_name = token
self._last_token_type = "enname"
def __init_part(self, token: str):
# Part handler of the token loop: recognizes PART / CD / DVD / DISK / DISC markers.
# Only active once a name is known.
if not self.get_name():
return
# ... and at least one of year, season, episode, resolution or resource type was recognized
if not self.year \
and not self.begin_season \
and not self.begin_episode \
and not self.resource_pix \
and not self.resource_type:
return
re_res = re.search(r"%s" % self._part_re, token, re.IGNORECASE)
if re_res:
if not self.part:
self.part = re_res.group(1)
# Inspect the token returned by tokens.cur(): a one-digit number, a two-digit number
# starting with 0, or A/B/C/I/II/III is appended to the part marker and consumed
nextv = self.tokens.cur()
if nextv \
and ((nextv.isdigit() and (len(nextv) == 1 or len(nextv) == 2 and nextv.startswith('0')))
or nextv.upper() in ['A', 'B', 'C', 'I', 'II', 'III']):
self.part = "%s%s" % (self.part, nextv)
self.tokens.get_next()
self._last_token_type = "part"
self._continue_flag = False
self._stop_name_flag = False
def __init_year(self, token: str):
    """
    Year handler of the token loop.

    A 4-digit token strictly between 1900 and 2050 that follows a name becomes the year.
    A year recognized earlier is then considered part of the title and appended to the
    English (or else the Chinese) name.

    :param token: current token
    """
    if not self.get_name():
        return
    if not (token.isdigit() and len(token) == 4):
        return
    if not 1900 < int(token) < 2050:
        return
    if self.year:
        # The earlier "year" was really part of the title
        if self.en_name:
            self.en_name = "%s %s" % (self.en_name.strip(), self.year)
        elif self.cn_name:
            self.cn_name = "%s %s" % (self.cn_name, self.year)
    elif self.en_name and re.search(r"SEASON$", self.en_name, re.IGNORECASE):
        # The English name ends with "Season", so "Season" belongs to the title; the
        # appended space keeps a SEASON$ noise pattern from matching it later
        self.en_name += ' '
    self.year = token
    self._last_token_type = "year"
    self._continue_flag = False
    self._stop_name_flag = True
def __init_resource_pix(self, token: str):
# Resolution handler of the token loop; only active once a name is known.
# Recognizes tokens matched by _resources_pix_re and, failing that, by _resources_pix_re2.
if not self.get_name():
return
re_res = re.findall(r"%s" % self._resources_pix_re, token, re.IGNORECASE)
if re_res:
self._last_token_type = "pix"
self._continue_flag = False
self._stop_name_flag = True
resource_pix = None
for pixs in re_res:
# With several groups findall yields tuples: take the first non-empty group
if isinstance(pixs, tuple):
pix_t = None
for pix_i in pixs:
if pix_i:
pix_t = pix_i
break
if pix_t:
resource_pix = pix_t
else:
resource_pix = pixs
# The first resolution found wins; an already recognized one is kept
if resource_pix and not self.resource_pix:
self.resource_pix = resource_pix.lower()
break
# A bare number (e.g. the height captured from WxH) gets a "p" suffix
if self.resource_pix \
and self.resource_pix.isdigit() \
and self.resource_pix[-1] not in 'kpi':
self.resource_pix = "%sp" % self.resource_pix
else:
re_res = re.search(r"%s" % self._resources_pix_re2, token, re.IGNORECASE)
if re_res:
self._last_token_type = "pix"
self._continue_flag = False
self._stop_name_flag = True
if not self.resource_pix:
self.resource_pix = re_res.group(1).lower()
def __init_season(self, token: str):
# Season handler of the token loop: recognizes tokens matched by _season_re, or a short
# number that directly follows the word "SEASON".
re_res = re.findall(r"%s" % self._season_re, token, re.IGNORECASE)
if re_res:
self._last_token_type = "season"
self.type = MediaType.TV
self._stop_name_flag = True
# The token stays available to the following handlers (it may also carry an episode)
self._continue_flag = True
for se in re_res:
# With several groups findall yields tuples: take the first numeric group
if isinstance(se, tuple):
se_t = None
for se_i in se:
if se_i and str(se_i).isdigit():
se_t = se_i
break
if se_t:
se = int(se_t)
else:
break
else:
se = int(se)
if self.begin_season is None:
self.begin_season = se
self.total_seasons = 1
else:
# A later, larger season number extends the range
if se > self.begin_season:
self.end_season = se
self.total_seasons = (self.end_season - self.begin_season) + 1
# For a file name, a multi-season range is collapsed back to a single season
if self.isfile and self.total_seasons > 1:
self.end_season = None
self.total_seasons = 1
elif token.isdigit():
try:
int(token)
except ValueError:
return
# A number of at most two digits right after the word "SEASON"
if self._last_token_type == "SEASON" \
and self.begin_season is None \
and len(token) < 3:
self.begin_season = int(token)
self.total_seasons = 1
self._last_token_type = "season"
self._stop_name_flag = True
self._continue_flag = False
self.type = MediaType.TV
elif token.upper() == "SEASON" and self.begin_season is None:
# Remember the keyword so the next numeric token is read as the season
self._last_token_type = "SEASON"
def __init_episode(self, token: str):
# Episode handler of the token loop: recognizes tokens matched by _episode_re, bare numbers
# in episode position, and a number that directly follows the word "EPISODE".
re_res = re.findall(r"%s" % self._episode_re, token, re.IGNORECASE)
if re_res:
self._last_token_type = "episode"
self._continue_flag = False
self._stop_name_flag = True
self.type = MediaType.TV
for se in re_res:
# With several groups findall yields tuples: take the first numeric group
if isinstance(se, tuple):
se_t = None
for se_i in se:
if se_i and str(se_i).isdigit():
se_t = se_i
break
if se_t:
se = int(se_t)
else:
break
else:
se = int(se)
if self.begin_episode is None:
self.begin_episode = se
self.total_episodes = 1
else:
# A later, larger episode number extends the range
if se > self.begin_episode:
self.end_episode = se
self.total_episodes = (self.end_episode - self.begin_episode) + 1
# For a file name, a range of more than two episodes is collapsed to a single episode
if self.isfile and self.total_episodes > 2:
self.end_episode = None
self.total_episodes = 1
elif token.isdigit():
try:
int(token)
except ValueError:
return
# A larger number right after an episode token closes the episode range
if self.begin_episode is not None \
and self.end_episode is None \
and len(token) < 5 \
and int(token) > self.begin_episode \
and self._last_token_type == "episode":
self.end_episode = int(token)
self.total_episodes = (self.end_episode - self.begin_episode) + 1
if self.isfile and self.total_episodes > 2:
self.end_episode = None
self.total_episodes = 1
self._continue_flag = False
self.type = MediaType.TV
# A bare 2-3 digit number that does not follow a year or video codec token and is not
# the deferred title number is taken as the episode
elif self.begin_episode is None \
and 1 < len(token) < 4 \
and self._last_token_type != "year" \
and self._last_token_type != "videoencode" \
and token != self._unknown_name_str:
self.begin_episode = int(token)
self.total_episodes = 1
self._last_token_type = "episode"
self._continue_flag = False
self._stop_name_flag = True
self.type = MediaType.TV
# A number of up to four digits right after the word "EPISODE"
elif self._last_token_type == "EPISODE" \
and self.begin_episode is None \
and len(token) < 5:
self.begin_episode = int(token)
self.total_episodes = 1
self._last_token_type = "episode"
self._continue_flag = False
self._stop_name_flag = True
self.type = MediaType.TV
elif token.upper() == "EPISODE":
# Remember the keyword so the next numeric token is read as the episode
self._last_token_type = "EPISODE"
def __init_resource_type(self, token):
# Resource handler of the token loop; only active once a name is known.
# Collects the source (_source_re) and the effects (_effect_re).
if not self.get_name():
return
source_res = re.search(r"(%s)" % self._source_re, token, re.IGNORECASE)
if source_res:
self._last_token_type = "source"
self._continue_flag = False
self._stop_name_flag = True
# The first source found wins
if not self._source:
self._source = source_res.group(1)
self._last_token = self._source.upper()
return
# "WEB" followed by "DL" is joined into WEB-DL
elif token.upper() == "DL" \
and self._last_token_type == "source" \
and self._last_token == "WEB":
self._source = "WEB-DL"
self._continue_flag = False
return
# "BLU" followed by "RAY" is joined into BluRay
elif token.upper() == "RAY" \
and self._last_token_type == "source" \
and self._last_token == "BLU":
self._source = "BluRay"
self._continue_flag = False
return
elif token.upper() == "WEBDL":
self._source = "WEB-DL"
self._continue_flag = False
return
effect_res = re.search(r"(%s)" % self._effect_re, token, re.IGNORECASE)
if effect_res:
self._last_token_type = "effect"
self._continue_flag = False
self._stop_name_flag = True
effect = effect_res.group(1)
# Every distinct effect is collected once
if effect not in self._effect:
self._effect.append(effect)
self._last_token = effect.upper()
def __init_video_encode(self, token: str):
# Video codec handler of the token loop; only active once a name is known.
if not self.get_name():
return
# ... and at least one of year, resolution, resource type, season or episode was recognized
if not self.year \
and not self.resource_pix \
and not self.resource_type \
and not self.begin_season \
and not self.begin_episode:
return
re_res = re.search(r"(%s)" % self._video_encode_re, token, re.IGNORECASE)
if re_res:
self._continue_flag = False
self._stop_name_flag = True
self._last_token_type = "videoencode"
if not self.video_encode:
self.video_encode = re_res.group(1).upper()
self._last_token = self.video_encode
# "10bit" was seen before the codec: put the codec in front of it
elif self.video_encode == "10bit":
self.video_encode = f"{re_res.group(1).upper()} 10bit"
self._last_token = re_res.group(1).upper()
# A lone "H" / "X" may be the first half of a split codec name (e.g. "H" + "264")
elif token.upper() in ['H', 'X']:
self._continue_flag = False
self._stop_name_flag = True
self._last_token_type = "videoencode"
self._last_token = token.upper() if token.upper() == "H" else token.lower()
# NOTE(review): _last_token is stored as lower-case "x" above but compared with upper-case
# 'X' here, so a split "x" + "264" does not seem to reach this branch - confirm
elif token in ["264", "265"] \
and self._last_token_type == "videoencode" \
and self._last_token in ['H', 'X']:
self.video_encode = "%s%s" % (self._last_token, token)
# A number after "VC" / "MPEG" is appended to the codec name
elif token.isdigit() \
and self._last_token_type == "videoencode" \
and self._last_token in ['VC', 'MPEG']:
self.video_encode = "%s%s" % (self._last_token, token)
elif token.upper() == "10BIT":
self._last_token_type = "videoencode"
if not self.video_encode:
self.video_encode = "10bit"
else:
self.video_encode = f"{self.video_encode} 10bit"
def __init_audio_encode(self, token: str):
    """
    Recognise an audio-encoding token and accumulate it in ``self.audio_encode``.

    Nothing is done unless ``self.get_name()`` is truthy and at least one of
    ``year``, ``resource_pix``, ``resource_type``, ``begin_season`` or
    ``begin_episode`` is already set.

    Tokens matched by ``self._audio_encode_re`` are appended to the current
    value ("-" after a bare "DTS", a space otherwise). Purely numeric tokens
    that directly follow an audio token are folded in as channel-style
    numbers, e.g. "DD5" + "1" -> "DD 5.1".

    :param token: the token currently being examined
    """
    if not self.get_name():
        return
    has_context = (self.year
                   or self.resource_pix
                   or self.resource_type
                   or self.begin_season
                   or self.begin_episode)
    if not has_context:
        return

    matched = re.search(r"(%s)" % self._audio_encode_re, token, re.IGNORECASE)
    if matched:
        codec = matched.group(1)
        self._continue_flag = False
        self._stop_name_flag = True
        self._last_token_type = "audioencode"
        self._last_token = codec.upper()
        if not self.audio_encode:
            self.audio_encode = codec
        elif self.audio_encode.upper() == "DTS":
            self.audio_encode = f"{self.audio_encode}-{codec}"
        else:
            self.audio_encode = f"{self.audio_encode} {codec}"
        return

    # Only a number right after an audio token is of interest from here on
    if not token.isdigit() or self._last_token_type != "audioencode":
        return

    current = self.audio_encode
    if current:
        if self._last_token.isdigit():
            # Second number in a row: "<...> 5" + "1" -> "<...> 5.1"
            self.audio_encode = f"{current}.{token}"
        elif current[-1].isdigit():
            # Codec ends in a digit: split it off, "DD5" + "1" -> "DD 5.1"
            self.audio_encode = f"{current[:-1]} {current[-1]}.{token}"
        else:
            self.audio_encode = f"{current} {token}"
    self._last_token = token

View File

@@ -0,0 +1,111 @@
import regex as re
from app.utils.singleton import Singleton
class ReleaseGroupsMatcher(metaclass=Singleton):
    """
    Recognise release groups / subtitle groups in a resource title.

    Built with the project's ``Singleton`` metaclass (presumably so a single
    shared instance carries the custom settings - confirm against
    ``app.utils.singleton``).
    """
    # Built-in patterns joined with "|"; assembled once in __init__
    __release_groups: str = None
    # Extra "|"-separated patterns supplied through update_custom()
    custom_release_groups: str = None
    # Separator used to join several matched groups; "@" when unset
    custom_separator: str = None
    # Regex fragments grouped under a key (the keys look like site names).
    # Fragments are joined verbatim into one alternation, so they must not
    # contain stray whitespace: it would become part of the match.
    RELEASE_GROUPS: dict = {
        "0ff": ['FF(?:(?:A|WE)B|CD|E(?:DU|B)|TV)'],
        "1pt": [],
        "52pt": [],
        "audiences": ['Audies', 'AD(?:Audio|E(?:|book)|Music|Web)'],
        "azusa": [],
        "beitai": ['BeiTai'],
        "btschool": ['Bts(?:CHOOL|HD|PAD|TV)', 'Zone'],
        "carpt": ['CarPT'],
        "chdbits": ['CHD(?:|Bits|PAD|(?:|HK)TV|WEB)', 'StBOX', 'OneHD', 'Lee', 'xiaopie'],
        "discfan": [],
        "dragonhd": [],
        "eastgame": ['(?:(?:iNT|(?:HALFC|Mini(?:S|H|FH)D))-|)TLF'],
        "filelist": [],
        "gainbound": ['(?:DG|GBWE)B'],
        "hares": ['Hares(?:|(?:M|T)V|Web)'],
        "hd4fans": [],
        "hdarea": ['HDA(?:pad|rea|TV)', 'EPiC'],
        "hdatmos": [],
        "hdbd": [],
        "hdchina": ['HDC(?:|hina|TV)', 'k9611', 'tudou', 'iHD'],
        "hddolby": ['D(?:ream|BTV)', '(?:HD|QHstudI)o'],
        "hdfans": ['beAst(?:|TV)'],
        "hdhome": ['HDH(?:|ome|Pad|TV|WEB)'],
        "hdpt": ['HDPT(?:|Web)'],
        "hdsky": ['HDS(?:|ky|TV|Pad|WEB)', 'AQLJ'],
        "hdtime": [],
        "HDU": [],
        "hdvideo": [],
        "hdzone": ['HDZ(?:|one)'],
        "hhanclub": ['HHWEB'],
        "hitpt": [],
        "htpt": ['HTPT'],
        "iptorrents": [],
        "joyhd": [],
        "keepfrds": ['FRDS', 'Yumi', 'cXcY'],
        "lemonhd": ['L(?:eague(?:(?:C|H)D|(?:M|T)V|NF|WEB)|HD)', 'i18n', 'CiNT'],
        "mteam": ['MTeam(?:|TV)', 'MPAD'],
        "nanyangpt": [],
        "nicept": [],
        "oshen": [],
        "ourbits": ['Our(?:Bits|TV)', 'FLTTH', 'Ao', 'PbK', 'MGs', 'iLove(?:HD|TV)'],
        "piggo": ['PiGo(?:NF|(?:H|WE)B)'],
        "ptchina": [],
        "pterclub": ['PTer(?:|DIY|Game|(?:M|T)V|WEB)'],
        "pthome": ['PTH(?:|Audio|eBook|music|ome|tv|WEB)'],
        "ptmsg": [],
        "ptsbao": ['PTsbao', 'OPS', 'F(?:Fans(?:AIeNcE|BD|D(?:VD|IY)|TV|WEB)|HDMv)', 'SGXT'],
        "pttime": [],
        "putao": ['PuTao'],
        "soulvoice": [],
        "springsunday": ['CMCT(?:|V)'],
        "sharkpt": ['Shark(?:|WEB|DIY|TV|MV)'],
        "tccf": [],
        "tjupt": ['TJUPT'],
        "totheglory": ['TTG', 'WiKi', 'NGB', 'DoA', '(?:ARi|ExRE)N'],
        "U2": [],
        "ultrahd": [],
        "others": ['B(?:MDru|eyondHD|TN)', 'C(?:fandora|trlhd|MRG)', 'DON', 'EVO', 'FLUX', 'HONE(?:|yG)',
                   'N(?:oGroup|T(?:b|G))', 'PandaMoon', 'SMURF', 'T(?:EPES|aengoo|rollHD)'],
        "anime": ['ANi', 'HYSUB', 'KTXP', 'LoliHouse', 'MCE', 'Nekomoe kissaten', '(?:Lilith|NC)-Raws', '织梦字幕组']
    }

    def __init__(self):
        """
        Flatten every pattern in RELEASE_GROUPS into one "|"-joined alternation.
        """
        self.__release_groups = '|'.join(
            release_group
            for site_groups in self.RELEASE_GROUPS.values()
            for release_group in site_groups
        )

    def match(self, title: str = None, groups: str = None):
        """
        Find the release groups / subtitle groups mentioned in a title.

        A group only counts when it is immediately preceded by one of
        ``- @ [ £ 【 &`` and immediately followed by one of
        ``@ . ] [ 】 &`` or whitespace. Matching is case-insensitive and the
        text is returned as it appears in the title.

        :param title: resource title or file name
        :param groups: "|"-separated patterns to search for; when empty, the
                       built-in list plus any custom groups is used
        :return: the distinct matches in order of first appearance, joined by
                 the custom separator (default "@"); "" when title is empty or
                 nothing matches
        """
        if not title:
            return ""
        if not groups:
            if self.custom_release_groups:
                groups = f"{self.__release_groups}|{self.custom_release_groups}"
            else:
                groups = self.__release_groups
        # A trailing space lets a group at the very end satisfy the lookahead
        title = f"{title} "
        groups_re = re.compile(r"(?<=[-@\[£【&])(?:%s)(?=[@.\s\]\[】&])" % groups, re.I)
        # A group may be recognised several times: keep the first occurrence
        # of each and preserve order (dict keys keep insertion order)
        unique_groups = list(dict.fromkeys(groups_re.findall(title)))
        separator = self.custom_separator or "@"
        return separator.join(unique_groups)

    def update_custom(self, release_groups: str = None, separator: str = None):
        """
        Replace the custom release/subtitle group patterns and the separator.

        :param release_groups: "|"-separated patterns searched in addition to
                               the built-in ones; None/empty disables them
        :param separator: string used to join several matches; None/empty
                          falls back to "@"
        """
        self.custom_release_groups = release_groups
        self.custom_separator = separator