feat 支持豆瓣作为识别源

2023-11-09 17:32:26 +08:00
parent 4072799c13
commit 05f1a24199
25 changed files with 690 additions and 239 deletions
--- a/app/modules/douban/init.py
+++ b/app/modules/douban/init.py
@@ -9,6 +9,7 @@ from app.core.metainfo import MetaInfo
 from app.log import logger
 from app.modules import _ModuleBase
 from app.modules.douban.apiv2 import DoubanApi
+from app.modules.douban.douban_cache import DoubanCache
 from app.modules.douban.scraper import DoubanScraper
 from app.schemas.types import MediaType
 from app.utils.common import retry
@@ -18,10 +19,12 @@ from app.utils.system import SystemUtils
 class DoubanModule(_ModuleBase):
    doubanapi: DoubanApi = None
    scraper: DoubanScraper = None
+    cache: DoubanCache = None

    def init_module(self) -> None:
        self.doubanapi = DoubanApi()
        self.scraper = DoubanScraper()
+        self.cache = DoubanCache()

    def stop(self):
        pass
@@ -29,10 +32,87 @@ class DoubanModule(_ModuleBase):
    def init_setting(self) -> Tuple[str, Union[str, bool]]:
        pass

-    def douban_info(self, doubanid: str) -> Optional[dict]:
+    def recognize_media(self, meta: MetaBase = None,
+                        mtype: MediaType = None,
+                        doubanid: str = None,
+                        **kwargs) -> Optional[MediaInfo]:
+        """
+        Recognize media through Douban; returns None unless RECOGNIZE_SOURCE is "douban".
+        :param meta:     parsed metadata; keys the cache lookup and drives the name search
+        :param mtype:    media type; overrides meta.type and accompanies doubanid
+        :param doubanid: Douban ID; on a cache miss the detail is fetched by this ID
+        :return: MediaInfo built from the Douban detail, or None when nothing matched
+        """
+        if settings.RECOGNIZE_SOURCE != "douban":
+            return None
+
+        if not meta:
+            cache_info = {}
+        else:
+            if mtype:
+                meta.type = mtype
+            cache_info = self.cache.get(meta)
+        if not cache_info:
+            # Nothing cached for this metadata, or no metadata to key the cache on
+            if doubanid:
+                # meta may be None on an ID-only lookup, so guard the type fallback
+                info = self.douban_info(doubanid=doubanid, mtype=mtype or (meta.type if meta else None))
+            elif meta:
+                if meta.begin_season:
+                    logger.info(f"正在识别 {meta.name} 第{meta.begin_season}季 ...")
+                else:
+                    logger.info(f"正在识别 {meta.name} ...")
+                # Search Douban by name, type, year and season
+                match_info = self.match_doubaninfo(name=meta.name,
+                                                   mtype=mtype or meta.type,
+                                                   year=meta.year,
+                                                   season=meta.begin_season)
+                if match_info:
+                    # Matched: fetch the detail for the matched ID
+                    info = self.douban_info(
+                        doubanid=match_info.get("id"),
+                        mtype=mtype or meta.type
+                    )
+                else:
+                    logger.info(f"{meta.name if meta else doubanid} 未匹配到豆瓣媒体信息")
+                    return None
+            else:
+                logger.error("识别媒体信息时未提供元数据或豆瓣ID")
+                return None
+            # Store the lookup result in the recognition cache
+            if meta:
+                self.cache.update(meta, info)
+        else:
+            # Cache hit
+            if cache_info.get("title"):
+                logger.info(f"{meta.name} 使用豆瓣识别缓存：{cache_info.get('title')}")
+                info = self.douban_info(mtype=cache_info.get("type"),
+                                        doubanid=cache_info.get("id"))
+            else:
+                logger.info(f"{meta.name} 使用豆瓣识别缓存：无法识别")
+                info = None
+
+        if info:
+            # Build the MediaInfo from the Douban detail and return it
+            mediainfo = MediaInfo(douban_info=info)
+            if meta:
+                logger.info(f"{meta.name} 豆瓣识别结果：{mediainfo.type.value} "
+                            f"{mediainfo.title_year} "
+                            f"{mediainfo.douban_id}")
+            else:
+                logger.info(f"{doubanid} 豆瓣识别结果：{mediainfo.type.value} "
+                            f"{mediainfo.title_year}")
+            return mediainfo
+        else:
+            logger.info(f"{meta.name if meta else doubanid} 未匹配到豆瓣媒体信息")
+
+        return None
+
+    def douban_info(self, doubanid: str, mtype: MediaType = None) -> Optional[dict]:
        """
        获取豆瓣信息
        :param doubanid: 豆瓣ID
+        :param mtype:    媒体类型
        :return: 豆瓣信息
        """
        """
@@ -300,22 +380,40 @@ class DoubanModule(_ModuleBase):
          "interest_cmt_earlier_tip_desc": "该短评的发布时间早于公开上映时间，作者可能通过其他渠道提前观看，请谨慎参考。其评分将不计入总评分。"
        }
        """
+
+        def __douban_tv():
+            """
+            获取豆瓣剧集信息
+            """
+            info = self.doubanapi.tv_detail(doubanid)
+            if info:
+                celebrities = self.doubanapi.tv_celebrities(doubanid)
+                if celebrities:
+                    info["directors"] = celebrities.get("directors")
+                    info["actors"] = celebrities.get("actors")
+            return info
+
+        def __douban_movie():
+            """
+            获取豆瓣电影信息
+            """
+            info = self.doubanapi.movie_detail(doubanid)
+            if info:
+                celebrities = self.doubanapi.movie_celebrities(doubanid)
+                if celebrities:
+                    info["directors"] = celebrities.get("directors")
+                    info["actors"] = celebrities.get("actors")
+            return info
+
        if not doubanid:
            return None
        logger.info(f"开始获取豆瓣信息：{doubanid} ...")
-        douban_info = self.doubanapi.movie_detail(doubanid)
-        if douban_info:
-            celebrities = self.doubanapi.movie_celebrities(doubanid)
-            if celebrities:
-                douban_info["directors"] = celebrities.get("directors")
-                douban_info["actors"] = celebrities.get("actors")
+        if mtype == MediaType.TV:
+            return __douban_tv()
+        elif mtype == MediaType.MOVIE:
+            return __douban_movie()
        else:
-            douban_info = self.doubanapi.tv_detail(doubanid)
-            celebrities = self.doubanapi.tv_celebrities(doubanid)
-            if douban_info and celebrities:
-                douban_info["directors"] = celebrities.get("directors")
-                douban_info["actors"] = celebrities.get("actors")
-        return douban_info
+            return __douban_movie() or __douban_tv()

    def douban_discover(self, mtype: MediaType, sort: str, tags: str,
                        page: int = 1, count: int = 30) -> Optional[List[dict]]:
@@ -407,12 +505,12 @@ class DoubanModule(_ModuleBase):

    @retry(Exception, 5, 3, 3, logger=logger)
    def match_doubaninfo(self, name: str, imdbid: str = None,
-                         mtype: str = None, year: str = None, season: int = None) -> dict:
+                         mtype: MediaType = None, year: str = None, season: int = None) -> dict:
        """
        搜索和匹配豆瓣信息
        :param name:  名称
        :param imdbid:  IMDB ID
-        :param mtype:  类型 电影/电视剧
+        :param mtype:  类型
        :param year:  年份
        :param season:  季号
        """
@@ -441,7 +539,7 @@ class DoubanModule(_ModuleBase):
            type_name = item_obj.get("type_name")
            if type_name not in [MediaType.TV.value, MediaType.MOVIE.value]:
                continue
-            if mtype and mtype != type_name:
+            if mtype and mtype.value != type_name:
                continue
            if mtype == MediaType.TV and not season:
                season = 1
@@ -486,17 +584,20 @@ class DoubanModule(_ModuleBase):
            meta = MetaInfo(path.stem)
            if not meta.name:
                return
-            # 根据名称查询豆瓣数据
-            doubaninfo = self.match_doubaninfo(name=mediainfo.title,
-                                               imdbid=mediainfo.imdb_id,
-                                               mtype=mediainfo.type.value,
-                                               year=mediainfo.year,
-                                               season=meta.begin_season)
-            if not doubaninfo:
-                logger.warn(f"未找到 {mediainfo.title} 的豆瓣信息")
-                return
            # 查询豆瓣详情
-            doubaninfo = self.douban_info(doubaninfo.get("id"))
+            if not mediainfo.douban_id:
+                # 根据名称查询豆瓣数据
+                doubaninfo = self.match_doubaninfo(name=mediainfo.title,
+                                                   imdbid=mediainfo.imdb_id,
+                                                   mtype=mediainfo.type,
+                                                   year=mediainfo.year)
+                if not doubaninfo:
+                    logger.warn(f"未找到 {mediainfo.title} 的豆瓣信息")
+                    return
+                doubaninfo = self.douban_info(doubanid=doubaninfo.get("id"), mtype=mediainfo.type)
+            else:
+                doubaninfo = self.douban_info(doubanid=mediainfo.douban_id,
+                                              mtype=mediainfo.type)
            # 刮削路径
            scrape_path = path / path.name
            self.scraper.gen_scraper_files(meta=meta,
@@ -513,17 +614,21 @@ class DoubanModule(_ModuleBase):
                    meta = MetaInfo(file.stem)
                    if not meta.name:
                        continue
-                    # 根据名称查询豆瓣数据
-                    doubaninfo = self.match_doubaninfo(name=mediainfo.title,
-                                                       imdbid=mediainfo.imdb_id,
-                                                       mtype=mediainfo.type.value,
-                                                       year=mediainfo.year,
-                                                       season=meta.begin_season)
-                    if not doubaninfo:
-                        logger.warn(f"未找到 {mediainfo.title} 的豆瓣信息")
-                        break
-                    # 查询豆瓣详情
-                    doubaninfo = self.douban_info(doubaninfo.get("id"))
+                    if not mediainfo.douban_id:
+                        # 根据名称查询豆瓣数据
+                        doubaninfo = self.match_doubaninfo(name=mediainfo.title,
+                                                           imdbid=mediainfo.imdb_id,
+                                                           mtype=mediainfo.type,
+                                                           year=mediainfo.year,
+                                                           season=meta.begin_season)
+                        if not doubaninfo:
+                            logger.warn(f"未找到 {mediainfo.title} 的豆瓣信息")
+                            break
+                        # 查询豆瓣详情
+                        doubaninfo = self.douban_info(doubanid=doubaninfo.get("id"), mtype=mediainfo.type)
+                    else:
+                        doubaninfo = self.douban_info(doubanid=mediainfo.douban_id,
+                                                      mtype=mediainfo.type)
                    # 刮削
                    self.scraper.gen_scraper_files(meta=meta,
                                                   mediainfo=MediaInfo(douban_info=doubaninfo),
@@ -532,3 +637,10 @@ class DoubanModule(_ModuleBase):
                except Exception as e:
                    logger.error(f"刮削文件 {file} 失败，原因：{str(e)}")
        logger.info(f"{path} 刮削完成")
+
+    def clear_cache(self):
+        """
+        Clear the Douban API cache and the Douban recognition cache
+        """
+        self.doubanapi.clear_cache()
+        self.cache.clear()
--- a/app/modules/douban/apiv2.py
+++ b/app/modules/douban/apiv2.py
@@ -427,6 +427,12 @@ class DoubanApi(metaclass=Singleton):
        return self.__invoke(self._urls["doulist_items"] % subject_id,
                             start=start, count=count, _ts=ts)

+    def clear_cache(self):
+        """
+        Clear the LRU cache held by __invoke
+        """
+        self.__invoke.cache_clear()
+
    def __del__(self):
        if self._session:
            self._session.close()
--- a/app/modules/douban/douban_cache.py
+++ b/app/modules/douban/douban_cache.py
@@ -0,0 +1,228 @@
+import pickle
+import random
+import time
+from pathlib import Path
+from threading import RLock
+from typing import Optional
+
+from app.core.config import settings
+from app.core.meta import MetaBase
+from app.utils.singleton import Singleton
+from app.schemas.types import MediaType
+
+lock = RLock()
+
+CACHE_EXPIRE_TIMESTAMP_STR = "cache_expire_timestamp"
+EXPIRE_TIMESTAMP = settings.CACHE_CONF.get('meta')
+
+
+class DoubanCache(metaclass=Singleton):
+    """
+    Douban recognition cache, pickled to disk. A recognized entry looks like:
+    {
+        "id": '',
+        "title": '',
+        "year": '',
+        "type": MediaType
+    }
+    """
+    # In-memory cache: key -> entry dict
+    _meta_data: dict = {}
+    # Path of the cache file
+    _meta_path: Path = None
+    # Whether expired entries are evicted
+    _tmdb_cache_expire: bool = True
+
+    def __init__(self):
+        self._meta_path = settings.TEMP_PATH / "__douban_cache__"
+        self._meta_data = self.__load(self._meta_path)
+
+    def clear(self):
+        """
+        Drop every in-memory cache entry.
+        """
+        with lock:
+            self._meta_data = {}
+
+    @staticmethod
+    def __get_key(meta: MetaBase) -> str:
+        """
+        Build the cache key: "[type]name-year-begin_season".
+        """
+        return f"[{meta.type.value if meta.type else '未知'}]{meta.name}-{meta.year}-{meta.begin_season}"
+
+    def get(self, meta: MetaBase):
+        """
+        Look up the entry cached for meta; returns {} when there is none.
+        """
+        key = self.__get_key(meta)
+        with lock:
+            info: dict = self._meta_data.get(key)
+            if info:
+                expire = info.get(CACHE_EXPIRE_TIMESTAMP_STR)
+                if not expire or int(time.time()) < expire:
+                    # Still valid (or never stamped): renew the expiry in place
+                    info[CACHE_EXPIRE_TIMESTAMP_STR] = int(time.time()) + EXPIRE_TIMESTAMP
+                elif expire and self._tmdb_cache_expire:
+                    # Expired: evict it; the stale entry is still returned this one time
+                    self.delete(key)
+            return info or {}
+
+    def delete(self, key: str) -> dict:
+        """
+        Remove one entry.
+        @param key: cache key
+        @return: the removed entry, or None when the key was not cached
+        """
+        with lock:
+            return self._meta_data.pop(key, None)
+
+    def delete_by_doubanid(self, doubanid: str) -> None:
+        """
+        Remove every entry cached for the given Douban ID.
+        """
+        with lock:
+            # Scan a snapshot of the keys so entries can be removed while iterating
+            for key in list(self._meta_data):
+                if self._meta_data.get(key, {}).get("id") == doubanid:
+                    self._meta_data.pop(key, None)
+
+    def delete_unknown(self) -> None:
+        """
+        Remove the unrecognized placeholders (id "0") so they can be searched again.
+        """
+        with lock:
+            # Scan a snapshot of the keys so entries can be removed while iterating
+            for key in list(self._meta_data):
+                if self._meta_data.get(key, {}).get("id") == "0":
+                    self._meta_data.pop(key, None)
+
+    def modify(self, key: str, title: str) -> dict:
+        """
+        Change the title of a cached entry and renew its expiry.
+        @param key: cache key
+        @param title: new title
+        @return: the entry after the change, or None when the key is not cached
+        """
+        with lock:
+            if self._meta_data.get(key):
+                self._meta_data[key]['title'] = title
+                self._meta_data[key][CACHE_EXPIRE_TIMESTAMP_STR] = int(time.time()) + EXPIRE_TIMESTAMP
+            return self._meta_data.get(key)
+
+    @staticmethod
+    def __load(path: Path) -> dict:
+        """
+        Load the cache from a pickle file; returns {} when it is missing or unreadable.
+        NOTE: pickle.load can execute arbitrary code, so the cache file must be trusted.
+        """
+        try:
+            if path.exists():
+                with open(path, 'rb') as f:
+                    data = pickle.load(f)
+                return data
+            return {}
+        except Exception as e:
+            # Best effort: any failure is printed and treated as an empty cache
+            print(str(e))
+            return {}
+
+    def update(self, meta: MetaBase, info: dict) -> None:
+        """
+        Insert or refresh the cache entry for meta.
+        :param meta: metadata the cache key is derived from
+        :param info: media detail; empty marks "unrecognized", None caches nothing
+        """
+        with lock:
+            if info:
+                is_movie = info.get("media_type") == MediaType.MOVIE
+                # Title: fall back to "title" for payloads without TMDB-style keys
+                cache_title = (info.get("title") if is_movie else info.get("name")) \
+                    or info.get("title")
+                # Year: fall back to a plain "year" field when no TMDB-style date exists
+                cache_year = (info.get('release_date') if is_movie else info.get('first_air_date')) \
+                    or info.get("year")
+                if cache_year:
+                    cache_year = str(cache_year)[:4]
+                # Media type
+                if isinstance(info.get('media_type'), MediaType):
+                    mtype = info.get('media_type')
+                else:
+                    mtype = MediaType.MOVIE if info.get("type") == "movie" else MediaType.TV
+                # Poster
+                poster_path = (info.get("pic") or {}).get("large")
+                if not poster_path and info.get("cover_url"):
+                    poster_path = info.get("cover_url")
+                if not poster_path and info.get("cover"):
+                    poster_path = info.get("cover").get("url")
+                self._meta_data[self.__get_key(meta)] = {
+                        "id": info.get("id"),
+                        "type": mtype,
+                        "year": cache_year,
+                        "title": cache_title,
+                        "poster_path": poster_path,
+                        CACHE_EXPIRE_TIMESTAMP_STR: int(time.time()) + EXPIRE_TIMESTAMP
+                    }
+            elif info is not None:
+                # None is not cached: it signals a network error, so the request may be retried
+                self._meta_data[self.__get_key(meta)] = {'id': "0"}
+
+    def save(self, force: bool = False) -> None:
+        """
+        Write the cache to disk; skipped when not forced and nothing needs saving.
+        """
+
+        meta_data = self.__load(self._meta_path)
+        # Entries without an id are not persisted
+        new_meta_data = {k: v for k, v in self._meta_data.items() if v.get("id")}
+
+        if not force \
+                and not self._random_sample(new_meta_data) \
+                and meta_data.keys() == new_meta_data.keys():
+            return
+
+        with open(self._meta_path, 'wb') as f:
+            pickle.dump(new_meta_data, f, pickle.HIGHEST_PROTOCOL)
+
+    def _random_sample(self, new_meta_data: dict) -> bool:
+        """
+        Stamp or evict entries of new_meta_data in place; True means a save is needed.
+        Under 25 entries all are checked, else 25 random ones (again after >= 5 evictions).
+        """
+        ret = False
+        check_all = len(new_meta_data) < 25
+        keys = list(new_meta_data) if check_all else random.sample(sorted(new_meta_data), 25)
+        evicted = 0
+        for k in keys:
+            info = new_meta_data.get(k)
+            expire = info.get(CACHE_EXPIRE_TIMESTAMP_STR)
+            if not expire:
+                # Never stamped: give the entry an expiry
+                ret = True
+                info[CACHE_EXPIRE_TIMESTAMP_STR] = int(time.time()) + EXPIRE_TIMESTAMP
+            elif int(time.time()) >= expire:
+                ret = True
+                if self._tmdb_cache_expire:
+                    new_meta_data.pop(k)
+                    evicted += 1
+        if not check_all and evicted >= 5:
+            ret |= self._random_sample(new_meta_data)
+        return ret
+
+    def get_title(self, key: str) -> Optional[str]:
+        """
+        Return the cached title for key, or None when it is absent or has no id.
+        """
+        cache_media_info = self._meta_data.get(key)
+        if not cache_media_info or not cache_media_info.get("id"):
+            return None
+        return cache_media_info.get("title")
+
+    def set_title(self, key: str, cn_title: str) -> None:
+        """
+        Overwrite the cached title for key; does nothing when the key is not cached.
+        """
+        cache_media_info = self._meta_data.get(key)
+        if not cache_media_info:
+            return
+        self._meta_data[key]['title'] = cn_title
--- a/app/modules/fanart/init.py
+++ b/app/modules/fanart/init.py
@@ -326,17 +326,19 @@ class FanartModule(_ModuleBase):
        :param mediainfo:  识别的媒体信息
        :return: 更新后的媒体信息
        """
+        if not mediainfo.tmdb_id and not mediainfo.tvdb_id:
+            return None
        if mediainfo.type == MediaType.MOVIE:
            result = self.__request_fanart(mediainfo.type, mediainfo.tmdb_id)
        else:
            if mediainfo.tvdb_id:
                result = self.__request_fanart(mediainfo.type, mediainfo.tvdb_id)
            else:
-                logger.info(f"{mediainfo.title_year} 没有tvdbid，无法获取Fanart图片")
-                return
+                logger.info(f"{mediainfo.title_year} 没有tvdbid，无法获取fanart图片")
+                return None
        if not result or result.get('status') == 'error':
-            logger.warn(f"没有获取到 {mediainfo.title_year} 的Fanart图片数据")
-            return
+            logger.warn(f"没有获取到 {mediainfo.title_year} 的fanart图片数据")
+            return None
        # 获取所有图片
        for name, images in result.items():
            if not images:
--- a/app/modules/themoviedb/init.py
+++ b/app/modules/themoviedb/init.py
@@ -43,7 +43,8 @@ class TheMovieDbModule(_ModuleBase):

    def recognize_media(self, meta: MetaBase = None,
                        mtype: MediaType = None,
-                        tmdbid: int = None) -> Optional[MediaInfo]:
+                        tmdbid: int = None,
+                        **kwargs) -> Optional[MediaInfo]:
        """
        识别媒体信息
        :param meta:     识别的元数据
@@ -51,6 +52,9 @@ class TheMovieDbModule(_ModuleBase):
        :param tmdbid:   tmdbid
        :return: 识别的媒体信息，包括剧集信息
        """
+        if settings.RECOGNIZE_SOURCE != "themoviedb":
+            return None
+
        if not meta:
            cache_info = {}
        else:
@@ -112,11 +116,11 @@ class TheMovieDbModule(_ModuleBase):
        else:
            # 使用缓存信息
            if cache_info.get("title"):
-                logger.info(f"{meta.name} 使用识别缓存：{cache_info.get('title')}")
+                logger.info(f"{meta.name} 使用TMDB识别缓存：{cache_info.get('title')}")
                info = self.tmdb.get_info(mtype=cache_info.get("type"),
                                          tmdbid=cache_info.get("id"))
            else:
-                logger.info(f"{meta.name} 使用识别缓存：无法识别")
+                logger.info(f"{meta.name} 使用TMDB识别缓存：无法识别")
                info = None

        if info:
@@ -129,11 +133,11 @@ class TheMovieDbModule(_ModuleBase):
            mediainfo = MediaInfo(tmdb_info=info)
            mediainfo.set_category(cat)
            if meta:
-                logger.info(f"{meta.name} 识别结果：{mediainfo.type.value} "
+                logger.info(f"{meta.name} TMDB识别结果：{mediainfo.type.value} "
                            f"{mediainfo.title_year} "
                            f"{mediainfo.tmdb_id}")
            else:
-                logger.info(f"{tmdbid} 识别结果：{mediainfo.type.value} "
+                logger.info(f"{tmdbid} TMDB识别结果：{mediainfo.type.value} "
                            f"{mediainfo.title_year}")

            # 补充剧集年份
@@ -143,10 +147,31 @@ class TheMovieDbModule(_ModuleBase):
                    mediainfo.season_years = episode_years
            return mediainfo
        else:
-            logger.info(f"{meta.name if meta else tmdbid} 未匹配到媒体信息")
+            logger.info(f"{meta.name if meta else tmdbid} 未匹配到TMDB媒体信息")

        return None

+    def match_doubaninfo(self, name: str, mtype: MediaType = None,
+                         year: str = None, season: int = None) -> dict:
+        """
+        Search TMDB by name and return the matched info.
+        :param name:  title to search for
+        :param mtype:  media type
+        :param year:  year; also passed on as the season year
+        :param season:  season number
+        :return: the result of self.tmdb.match, re-queried via get_info when it has no "genres"
+        """
+        logger.info(f"开始使用 名称：{name}、年份：{year} 匹配TMDB信息 ...")
+        matched = self.tmdb.match(name=name, year=year, mtype=mtype,
+                                  season_year=year, season_number=season)
+        if not matched or matched.get("genres"):
+            # Nothing found, or the match already carries genres: return it as is
+            return matched
+        # Match without genres: query again by media type and id
+        media_type = matched.get("media_type")
+        tmdb_id = matched.get("id")
+        return self.tmdb.get_info(mtype=media_type, tmdbid=tmdb_id)
+
    def tmdb_info(self, tmdbid: int, mtype: MediaType) -> Optional[dict]:
        """
        获取TMDB信息