feat 支持豆瓣作为识别源

This commit is contained in:
jxxghp
2023-11-09 17:32:26 +08:00
parent 4072799c13
commit 05f1a24199
25 changed files with 690 additions and 239 deletions

View File

@ -9,6 +9,7 @@ from app.core.metainfo import MetaInfo
from app.log import logger
from app.modules import _ModuleBase
from app.modules.douban.apiv2 import DoubanApi
from app.modules.douban.douban_cache import DoubanCache
from app.modules.douban.scraper import DoubanScraper
from app.schemas.types import MediaType
from app.utils.common import retry
@ -18,10 +19,12 @@ from app.utils.system import SystemUtils
class DoubanModule(_ModuleBase):
doubanapi: DoubanApi = None
scraper: DoubanScraper = None
cache: DoubanCache = None
def init_module(self) -> None:
self.doubanapi = DoubanApi()
self.scraper = DoubanScraper()
self.cache = DoubanCache()
def stop(self):
pass
@ -29,10 +32,87 @@ class DoubanModule(_ModuleBase):
def init_setting(self) -> Tuple[str, Union[str, bool]]:
pass
def douban_info(self, doubanid: str) -> Optional[dict]:
def recognize_media(self, meta: MetaBase = None,
                    mtype: MediaType = None,
                    doubanid: str = None,
                    **kwargs) -> Optional[MediaInfo]:
    """
    Recognize a media item using Douban as the data source.

    The method does nothing (returns None) unless
    ``settings.RECOGNIZE_SOURCE`` equals ``"douban"``.

    :param meta: parsed metadata of the item; when given, the recognition
                 cache is consulted first and updated afterwards
    :param mtype: media type; when given together with ``meta`` it
                  overwrites ``meta.type``, and it is passed along with
                  ``doubanid`` when fetching details
    :param doubanid: Douban ID; when given, details are fetched directly
                     instead of searching by name
    :param kwargs: extra keyword arguments, accepted and not used here
    :return: a ``MediaInfo`` built from the Douban details, or None when
             nothing could be recognized
    """
    if settings.RECOGNIZE_SOURCE != "douban":
        return None

    if meta:
        if mtype:
            meta.type = mtype
        cache_info = self.cache.get(meta)
    else:
        cache_info = {}

    if not cache_info:
        # Nothing cached for this item.
        # ``meta`` may be None when only a Douban ID was supplied, so the
        # media type must not be read from it unconditionally.
        media_type = mtype or (meta.type if meta else None)
        if doubanid:
            # Query the details directly by ID
            info = self.douban_info(doubanid=doubanid, mtype=media_type)
        elif meta:
            if meta.begin_season:
                logger.info(f"正在识别 {meta.name}{meta.begin_season}季 ...")
            else:
                logger.info(f"正在识别 {meta.name} ...")
            # Search Douban by name/year/season, then load the details
            match_info = self.match_doubaninfo(name=meta.name,
                                               mtype=media_type,
                                               year=meta.year,
                                               season=meta.begin_season)
            if not match_info:
                logger.info(f"{meta.name if meta else doubanid} 未匹配到豆瓣媒体信息")
                return None
            info = self.douban_info(doubanid=match_info.get("id"),
                                    mtype=media_type)
        else:
            logger.error("识别媒体信息时未提供元数据或豆瓣ID")
            return None
        # Remember the outcome for this metadata
        if meta:
            self.cache.update(meta, info)
    elif cache_info.get("title"):
        # Cached hit: reload the details by the cached ID and type
        logger.info(f"{meta.name} 使用豆瓣识别缓存:{cache_info.get('title')}")
        info = self.douban_info(mtype=cache_info.get("type"),
                                doubanid=cache_info.get("id"))
    else:
        # Cached entry without a title: treated as "could not recognize"
        logger.info(f"{meta.name} 使用豆瓣识别缓存:无法识别")
        info = None

    if not info:
        logger.info(f"{meta.name if meta else doubanid} 未匹配到豆瓣媒体信息")
        return None

    # Wrap the Douban details and return them
    mediainfo = MediaInfo(douban_info=info)
    if meta:
        logger.info(f"{meta.name} 豆瓣识别结果:{mediainfo.type.value} "
                    f"{mediainfo.title_year} "
                    f"{mediainfo.douban_id}")
    else:
        logger.info(f"{doubanid} 豆瓣识别结果:{mediainfo.type.value} "
                    f"{mediainfo.title_year}")
    return mediainfo
def douban_info(self, doubanid: str, mtype: MediaType = None) -> Optional[dict]:
"""
获取豆瓣信息
:param doubanid: 豆瓣ID
:param mtype: 媒体类型
:return: 豆瓣信息
"""
"""
@ -300,22 +380,40 @@ class DoubanModule(_ModuleBase):
"interest_cmt_earlier_tip_desc": "该短评的发布时间早于公开上映时间,作者可能通过其他渠道提前观看,请谨慎参考。其评分将不计入总评分。"
}
"""
def __douban_tv():
"""
获取豆瓣剧集信息
"""
info = self.doubanapi.tv_detail(doubanid)
if info:
celebrities = self.doubanapi.tv_celebrities(doubanid)
if celebrities:
info["directors"] = celebrities.get("directors")
info["actors"] = celebrities.get("actors")
return info
def __douban_movie():
"""
获取豆瓣电影信息
"""
info = self.doubanapi.movie_detail(doubanid)
if info:
celebrities = self.doubanapi.movie_celebrities(doubanid)
if celebrities:
info["directors"] = celebrities.get("directors")
info["actors"] = celebrities.get("actors")
return info
if not doubanid:
return None
logger.info(f"开始获取豆瓣信息:{doubanid} ...")
douban_info = self.doubanapi.movie_detail(doubanid)
if douban_info:
celebrities = self.doubanapi.movie_celebrities(doubanid)
if celebrities:
douban_info["directors"] = celebrities.get("directors")
douban_info["actors"] = celebrities.get("actors")
if mtype == MediaType.TV:
return __douban_tv()
elif mtype == MediaType.MOVIE:
return __douban_movie()
else:
douban_info = self.doubanapi.tv_detail(doubanid)
celebrities = self.doubanapi.tv_celebrities(doubanid)
if douban_info and celebrities:
douban_info["directors"] = celebrities.get("directors")
douban_info["actors"] = celebrities.get("actors")
return douban_info
return __douban_movie() or __douban_tv()
def douban_discover(self, mtype: MediaType, sort: str, tags: str,
page: int = 1, count: int = 30) -> Optional[List[dict]]:
@ -407,12 +505,12 @@ class DoubanModule(_ModuleBase):
@retry(Exception, 5, 3, 3, logger=logger)
def match_doubaninfo(self, name: str, imdbid: str = None,
mtype: str = None, year: str = None, season: int = None) -> dict:
mtype: MediaType = None, year: str = None, season: int = None) -> dict:
"""
搜索和匹配豆瓣信息
:param name: 名称
:param imdbid: IMDB ID
:param mtype: 类型 电影/电视剧
:param mtype: 类型
:param year: 年份
:param season: 季号
"""
@ -441,7 +539,7 @@ class DoubanModule(_ModuleBase):
type_name = item_obj.get("type_name")
if type_name not in [MediaType.TV.value, MediaType.MOVIE.value]:
continue
if mtype and mtype != type_name:
if mtype and mtype.value != type_name:
continue
if mtype == MediaType.TV and not season:
season = 1
@ -486,17 +584,20 @@ class DoubanModule(_ModuleBase):
meta = MetaInfo(path.stem)
if not meta.name:
return
# 根据名称查询豆瓣数据
doubaninfo = self.match_doubaninfo(name=mediainfo.title,
imdbid=mediainfo.imdb_id,
mtype=mediainfo.type.value,
year=mediainfo.year,
season=meta.begin_season)
if not doubaninfo:
logger.warn(f"未找到 {mediainfo.title} 的豆瓣信息")
return
# 查询豆瓣详情
doubaninfo = self.douban_info(doubaninfo.get("id"))
if not mediainfo.douban_id:
# 根据名称查询豆瓣数据
doubaninfo = self.match_doubaninfo(name=mediainfo.title,
imdbid=mediainfo.imdb_id,
mtype=mediainfo.type,
year=mediainfo.year)
if not doubaninfo:
logger.warn(f"未找到 {mediainfo.title} 的豆瓣信息")
return
doubaninfo = self.douban_info(doubanid=doubaninfo.get("id"), mtype=mediainfo.type)
else:
doubaninfo = self.douban_info(doubanid=mediainfo.douban_id,
mtype=mediainfo.type)
# 刮削路径
scrape_path = path / path.name
self.scraper.gen_scraper_files(meta=meta,
@ -513,17 +614,21 @@ class DoubanModule(_ModuleBase):
meta = MetaInfo(file.stem)
if not meta.name:
continue
# 根据名称查询豆瓣数据
doubaninfo = self.match_doubaninfo(name=mediainfo.title,
imdbid=mediainfo.imdb_id,
mtype=mediainfo.type.value,
year=mediainfo.year,
season=meta.begin_season)
if not doubaninfo:
logger.warn(f"未找到 {mediainfo.title} 的豆瓣信息")
break
# 查询豆瓣详情
doubaninfo = self.douban_info(doubaninfo.get("id"))
if not mediainfo.douban_id:
# 根据名称查询豆瓣数据
doubaninfo = self.match_doubaninfo(name=mediainfo.title,
imdbid=mediainfo.imdb_id,
mtype=mediainfo.type,
year=mediainfo.year,
season=meta.begin_season)
if not doubaninfo:
logger.warn(f"未找到 {mediainfo.title} 的豆瓣信息")
break
# 查询豆瓣详情
doubaninfo = self.douban_info(doubanid=doubaninfo.get("id"), mtype=mediainfo.type)
else:
doubaninfo = self.douban_info(doubanid=mediainfo.douban_id,
mtype=mediainfo.type)
# 刮削
self.scraper.gen_scraper_files(meta=meta,
mediainfo=MediaInfo(douban_info=doubaninfo),
@ -532,3 +637,10 @@ class DoubanModule(_ModuleBase):
except Exception as e:
logger.error(f"刮削文件 {file} 失败,原因:{str(e)}")
logger.info(f"{path} 刮削完成")
def clear_cache(self):
    """
    Clear cached Douban data: first the API client's cache, then the
    recognition cache held by this module.
    """
    api_client = self.doubanapi
    api_client.clear_cache()
    recognition_cache = self.cache
    recognition_cache.clear()

View File

@ -427,6 +427,12 @@ class DoubanApi(metaclass=Singleton):
return self.__invoke(self._urls["doulist_items"] % subject_id,
start=start, count=count, _ts=ts)
def clear_cache(self):
    """
    Clear the LRU cache attached to the private ``__invoke`` request method.
    """
    cached_invoke = self.__invoke
    cached_invoke.cache_clear()
def __del__(self):
if self._session:
self._session.close()

View File

@ -0,0 +1,228 @@
import pickle
import random
import time
from pathlib import Path
from threading import RLock
from typing import Optional
from app.core.config import settings
from app.core.meta import MetaBase
from app.utils.singleton import Singleton
from app.schemas.types import MediaType
# Re-entrant lock taken by DoubanCache methods around access to the in-memory cache dict
lock = RLock()
# Key under which each cache entry stores its expiry time
CACHE_EXPIRE_TIMESTAMP_STR = "cache_expire_timestamp"
# Read from settings.CACHE_CONF['meta'] and added to int(time.time()) to compute an
# entry's expiry, so despite the name this is a lifetime, not a timestamp.
# NOTE(review): .get() yields None if the key is absent, which would make that
# addition fail - confirm the setting is always present.
EXPIRE_TIMESTAMP = settings.CACHE_CONF.get('meta')
class DoubanCache(metaclass=Singleton):
    """
    Cache of Douban recognition results, persisted to disk with pickle.

    Entries are keyed by the string built in ``__get_key`` and look like::

        {
            "id": '',
            "title": '',
            "year": '',
            "type": MediaType,
            "poster_path": '',
            "cache_expire_timestamp": 0
        }

    An entry of ``{"id": "0"}`` records a lookup that produced an empty
    (but not None) result.
    """
    _meta_data: dict = {}
    # Path of the cache file
    _meta_path: Path = None
    # Whether expired entries are removed (the attribute name still says "tmdb")
    _tmdb_cache_expire: bool = True

    def __init__(self):
        self._meta_path = settings.TEMP_PATH / "__douban_cache__"
        self._meta_data = self.__load(self._meta_path)

    def clear(self):
        """
        Drop every entry from the in-memory cache.
        """
        with lock:
            self._meta_data = {}

    @staticmethod
    def __get_key(meta: MetaBase) -> str:
        """
        Build the cache key from the media type, name, year and first season.
        """
        return f"[{meta.type.value if meta.type else '未知'}]{meta.name}-{meta.year}-{meta.begin_season}"

    def get(self, meta: MetaBase):
        """
        Look up the cache entry for ``meta``.

        A live entry (or one without an expiry) has its expiry pushed forward.
        An expired entry is removed from the cache when expiry is enabled, but
        it is still returned by this call.

        :param meta: metadata used to build the cache key
        :return: the cached dict, or an empty dict when there is none
        """
        key = self.__get_key(meta)
        with lock:
            info: dict = self._meta_data.get(key)
            if info:
                expire = info.get(CACHE_EXPIRE_TIMESTAMP_STR)
                if not expire or int(time.time()) < expire:
                    info[CACHE_EXPIRE_TIMESTAMP_STR] = int(time.time()) + EXPIRE_TIMESTAMP
                    self._meta_data[key] = info
                elif expire and self._tmdb_cache_expire:
                    self.delete(key)
            return info or {}

    def delete(self, key: str) -> Optional[dict]:
        """
        Remove one cache entry.

        :param key: cache key
        :return: the removed entry, or None when the key was not cached
        """
        with lock:
            return self._meta_data.pop(key, None)

    def _delete_by_id(self, cache_id: str) -> None:
        """
        Remove every entry whose "id" equals ``cache_id``.
        """
        # Scan and remove under one lock so a concurrent change cannot make
        # the removal fail on a key that has just disappeared.
        with lock:
            stale_keys = [key for key, value in self._meta_data.items()
                          if (value or {}).get("id") == cache_id]
            for key in stale_keys:
                self._meta_data.pop(key, None)

    def delete_by_doubanid(self, doubanid: str) -> None:
        """
        Remove all entries cached for the given Douban ID so that the next
        recognition fetches fresh data.
        """
        self._delete_by_id(doubanid)

    def delete_unknown(self) -> None:
        """
        Remove all "unrecognized" entries (id "0") so they are searched again.
        """
        self._delete_by_id("0")

    def modify(self, key: str, title: str) -> dict:
        """
        Change the title of a cached entry and renew its expiry.

        :param key: cache key
        :param title: new title
        :return: the entry after modification (None when the key is not cached)
        """
        with lock:
            if self._meta_data.get(key):
                self._meta_data[key]['title'] = title
                self._meta_data[key][CACHE_EXPIRE_TIMESTAMP_STR] = int(time.time()) + EXPIRE_TIMESTAMP
            return self._meta_data.get(key)

    @staticmethod
    def __load(path: Path) -> dict:
        """
        Load the cache from ``path``; any failure yields an empty cache.
        """
        try:
            if path.exists():
                # NOTE: pickle executes code embedded in the file while
                # loading; the cache file must only ever be written by
                # this application.
                with open(path, 'rb') as f:
                    data = pickle.load(f)
                # Anything other than a dict cannot be used as the cache
                return data if isinstance(data, dict) else {}
            return {}
        except Exception as e:
            # Best effort: an unreadable cache file just means starting empty
            print(str(e))
            return {}

    def update(self, meta: MetaBase, info: dict) -> None:
        """
        Insert or refresh the cache entry for ``meta``.

        :param meta: metadata used to build the cache key
        :param info: media details; an empty dict is stored as an
                     "unrecognized" marker, None is not cached at all
        """
        with lock:
            if info:
                is_movie = info.get("media_type") == MediaType.MOVIE
                # Title: keep the original key preference, but fall back to
                # the other key so a dict that only carries one of
                # "title"/"name" is not cached with an empty title (which
                # readers of the cache treat as "unrecognized").
                title_key, alt_title_key = ("title", "name") if is_movie else ("name", "title")
                cache_title = info.get(title_key) or info.get(alt_title_key)
                # Year: same idea, falling back to a plain "year" field
                cache_year = info.get('release_date') if is_movie else info.get('first_air_date')
                if not cache_year:
                    cache_year = info.get("year")
                if cache_year:
                    cache_year = str(cache_year)[:4]
                # Media type
                if isinstance(info.get('media_type'), MediaType):
                    mtype = info.get('media_type')
                else:
                    mtype = MediaType.MOVIE if info.get("type") == "movie" else MediaType.TV
                # Poster; "pic" / "cover" may be present but empty
                poster_path = (info.get("pic") or {}).get("large")
                if not poster_path and info.get("cover_url"):
                    poster_path = info.get("cover_url")
                if not poster_path and info.get("cover"):
                    poster_path = info.get("cover").get("url")
                self._meta_data[self.__get_key(meta)] = {
                    "id": info.get("id"),
                    "type": mtype,
                    "year": cache_year,
                    "title": cache_title,
                    "poster_path": poster_path,
                    CACHE_EXPIRE_TIMESTAMP_STR: int(time.time()) + EXPIRE_TIMESTAMP
                }
            elif info is not None:
                # None means the request itself failed, so it is not cached
                # and may be retried; an empty result is remembered instead.
                self._meta_data[self.__get_key(meta)] = {'id': "0"}

    def save(self, force: bool = False) -> None:
        """
        Write the cache to its file.

        Unless ``force`` is set, the write is skipped when sampling finds
        nothing to refresh and the key set equals the one already on disk.

        :param force: write unconditionally
        """
        meta_data = self.__load(self._meta_path)
        # Snapshot under the lock so concurrent updates cannot change the
        # dict while it is being iterated.
        with lock:
            new_meta_data = {k: v for k, v in self._meta_data.items() if v.get("id")}

        if not force \
                and not self._random_sample(new_meta_data) \
                and meta_data.keys() == new_meta_data.keys():
            return

        with open(self._meta_path, 'wb') as f:
            pickle.dump(new_meta_data, f, pickle.HIGHEST_PROTOCOL)

    def _random_sample(self, new_meta_data: dict) -> bool:
        """
        Inspect entries to decide whether the cache needs to be written.

        With fewer than 25 entries all of them are checked; otherwise 25
        random ones are. Entries without an expiry get one; expired entries
        are removed from ``new_meta_data`` when expiry is enabled. When a
        sampled pass removes 5 or more entries, another pass is run.

        :param new_meta_data: entries about to be saved (modified in place)
        :return: True when anything was stamped or found expired
        """
        sampled = len(new_meta_data) >= 25
        if sampled:
            keys = random.sample(sorted(new_meta_data.keys()), 25)
        else:
            keys = list(new_meta_data.keys())

        ret = False
        removed = 0
        for k in keys:
            info = new_meta_data.get(k)
            expire = info.get(CACHE_EXPIRE_TIMESTAMP_STR)
            if not expire:
                ret = True
                info[CACHE_EXPIRE_TIMESTAMP_STR] = int(time.time()) + EXPIRE_TIMESTAMP
            elif int(time.time()) >= expire:
                ret = True
                if self._tmdb_cache_expire:
                    new_meta_data.pop(k)
                    removed += 1
        # Many expired entries in one sample suggests more remain
        if sampled and removed >= 5:
            ret |= self._random_sample(new_meta_data)
        return ret

    def get_title(self, key: str) -> Optional[str]:
        """
        Return the cached title for ``key``, or None when there is no entry
        or the entry has no id.
        """
        cache_media_info = self._meta_data.get(key)
        if not cache_media_info or not cache_media_info.get("id"):
            return None
        return cache_media_info.get("title")

    def set_title(self, key: str, cn_title: str) -> None:
        """
        Overwrite the cached title for ``key``; does nothing when the key
        has no entry.
        """
        cache_media_info = self._meta_data.get(key)
        if not cache_media_info:
            return
        self._meta_data[key]['title'] = cn_title

View File

@ -326,17 +326,19 @@ class FanartModule(_ModuleBase):
:param mediainfo: 识别的媒体信息
:return: 更新后的媒体信息
"""
if not mediainfo.tmdb_id and not mediainfo.tvdb_id:
return None
if mediainfo.type == MediaType.MOVIE:
result = self.__request_fanart(mediainfo.type, mediainfo.tmdb_id)
else:
if mediainfo.tvdb_id:
result = self.__request_fanart(mediainfo.type, mediainfo.tvdb_id)
else:
logger.info(f"{mediainfo.title_year} 没有tvdbid无法获取Fanart图片")
return
logger.info(f"{mediainfo.title_year} 没有tvdbid无法获取fanart图片")
return None
if not result or result.get('status') == 'error':
logger.warn(f"没有获取到 {mediainfo.title_year}Fanart图片数据")
return
logger.warn(f"没有获取到 {mediainfo.title_year}fanart图片数据")
return None
# 获取所有图片
for name, images in result.items():
if not images:

View File

@ -43,7 +43,8 @@ class TheMovieDbModule(_ModuleBase):
def recognize_media(self, meta: MetaBase = None,
mtype: MediaType = None,
tmdbid: int = None) -> Optional[MediaInfo]:
tmdbid: int = None,
**kwargs) -> Optional[MediaInfo]:
"""
识别媒体信息
:param meta: 识别的元数据
@ -51,6 +52,9 @@ class TheMovieDbModule(_ModuleBase):
:param tmdbid: tmdbid
:return: 识别的媒体信息,包括剧集信息
"""
if settings.RECOGNIZE_SOURCE != "themoviedb":
return None
if not meta:
cache_info = {}
else:
@ -112,11 +116,11 @@ class TheMovieDbModule(_ModuleBase):
else:
# 使用缓存信息
if cache_info.get("title"):
logger.info(f"{meta.name} 使用识别缓存:{cache_info.get('title')}")
logger.info(f"{meta.name} 使用TMDB识别缓存:{cache_info.get('title')}")
info = self.tmdb.get_info(mtype=cache_info.get("type"),
tmdbid=cache_info.get("id"))
else:
logger.info(f"{meta.name} 使用识别缓存:无法识别")
logger.info(f"{meta.name} 使用TMDB识别缓存:无法识别")
info = None
if info:
@ -129,11 +133,11 @@ class TheMovieDbModule(_ModuleBase):
mediainfo = MediaInfo(tmdb_info=info)
mediainfo.set_category(cat)
if meta:
logger.info(f"{meta.name} 识别结果:{mediainfo.type.value} "
logger.info(f"{meta.name} TMDB识别结果:{mediainfo.type.value} "
f"{mediainfo.title_year} "
f"{mediainfo.tmdb_id}")
else:
logger.info(f"{tmdbid} 识别结果:{mediainfo.type.value} "
logger.info(f"{tmdbid} TMDB识别结果:{mediainfo.type.value} "
f"{mediainfo.title_year}")
# 补充剧集年份
@ -143,10 +147,31 @@ class TheMovieDbModule(_ModuleBase):
mediainfo.season_years = episode_years
return mediainfo
else:
logger.info(f"{meta.name if meta else tmdbid} 未匹配到媒体信息")
logger.info(f"{meta.name if meta else tmdbid} 未匹配到TMDB媒体信息")
return None
def match_doubaninfo(self, name: str, mtype: MediaType = None,
                     year: str = None, season: int = None) -> dict:
    """
    Search TMDB for the best match and return its details.

    NOTE(review): despite the method name, the body queries ``self.tmdb``
    only - confirm whether the name is intentional.

    :param name: title to search for
    :param mtype: media type
    :param year: year, used both as the release year and the season year
    :param season: season number
    :return: the matched info; when the match carries no "genres" the full
             details are fetched by its id and returned instead
    """
    # Search
    logger.info(f"开始使用 名称:{name}、年份:{year} 匹配TMDB信息 ...")
    matched = self.tmdb.match(name=name,
                              year=year,
                              mtype=mtype,
                              season_year=year,
                              season_number=season)
    # Nothing found, or the match is already detailed: hand it back as is
    if not matched or matched.get("genres"):
        return matched
    return self.tmdb.get_info(mtype=matched.get("media_type"),
                              tmdbid=matched.get("id"))
def tmdb_info(self, tmdbid: int, mtype: MediaType) -> Optional[dict]:
"""
获取TMDB信息