From ec8c9c996a5b6495b13bc73957ece62adede3095 Mon Sep 17 00:00:00 2001 From: jxxghp Date: Sat, 2 Sep 2023 07:57:44 +0800 Subject: [PATCH] =?UTF-8?q?fix=20#356=20=E7=8C=AB=E7=AB=99=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E7=BB=9F=E8=AE=A1=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/modules/indexer/spider.py | 7 ++++++- app/plugins/sitestatistic/__init__.py | 9 +++++---- app/plugins/sitestatistic/siteuserinfo/__init__.py | 4 ++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/app/modules/indexer/spider.py b/app/modules/indexer/spider.py index ee1b2ea9..0b4ae684 100644 --- a/app/modules/indexer/spider.py +++ b/app/modules/indexer/spider.py @@ -262,7 +262,12 @@ class TorrentSpider: # 解码为字符串 page_source = raw_data.decode(encoding) except Exception as e: - logger.error(f"chardet解码失败:{e}") + logger.debug(f"chardet解码失败:{e}") + # 探测utf-8解码 + if re.search(r"charset=\"?utf-8\"?", ret.text, re.IGNORECASE): + ret.encoding = "utf-8" + else: + ret.encoding = ret.apparent_encoding page_source = ret.text else: page_source = ret.text diff --git a/app/plugins/sitestatistic/__init__.py b/app/plugins/sitestatistic/__init__.py index 5d7426de..5ff3b870 100644 --- a/app/plugins/sitestatistic/__init__.py +++ b/app/plugins/sitestatistic/__init__.py @@ -1,3 +1,4 @@ +import re import warnings from datetime import datetime, timedelta from multiprocessing.dummy import Pool as ThreadPool @@ -853,8 +854,8 @@ class SiteStatistic(_PluginBase): proxies=proxies ).get_res(url=url) if res and res.status_code == 200: - if "charset=utf-8" in res.text or "charset=UTF-8" in res.text: - res.encoding = "UTF-8" + if re.search(r"charset=\"?utf-8\"?", res.text, re.IGNORECASE): + res.encoding = "utf-8" else: res.encoding = res.apparent_encoding html_text = res.text @@ -893,8 +894,8 @@ class SiteStatistic(_PluginBase): proxies=proxies ).get_res(url=url + "/index.php") if res and res.status_code == 200: - if "charset=utf-8" in res.text or "charset=UTF-8" in res.text: - res.encoding = "UTF-8" + if re.search(r"charset=\"?utf-8\"?", res.text, re.IGNORECASE): + res.encoding = "utf-8" else: res.encoding = res.apparent_encoding html_text = res.text diff --git a/app/plugins/sitestatistic/siteuserinfo/__init__.py b/app/plugins/sitestatistic/siteuserinfo/__init__.py index ada4c657..fbeff1ae 100644 --- a/app/plugins/sitestatistic/siteuserinfo/__init__.py +++ b/app/plugins/sitestatistic/siteuserinfo/__init__.py @@ -247,8 +247,8 @@ class ISiteUserInfo(metaclass=ABCMeta): logger.warn( f"{self.site_name} 检测到Cloudflare,请更新Cookie和UA") return "" - if "charset=utf-8" in res.text or "charset=UTF-8" in res.text: - res.encoding = "UTF-8" + if re.search(r"charset=\"?utf-8\"?", res.text, re.IGNORECASE): + res.encoding = "utf-8" else: res.encoding = res.apparent_encoding return res.text