diff --git a/app/modules/indexer/spider.py b/app/modules/indexer/spider.py
index 2dc91ce0..2a15549f 100644
--- a/app/modules/indexer/spider.py
+++ b/app/modules/indexer/spider.py
@@ -4,6 +4,7 @@ import re
 from typing import List
 from urllib.parse import quote, urlencode
 
+import chardet
 from jinja2 import Template
 from pyquery import PyQuery
 from ruamel.yaml import CommentedMap
@@ -216,6 +217,7 @@ class TorrentSpider:
         logger.info(f"开始请求:{searchurl}")
 
         if self.render:
+            # Browser emulation: fetch the rendered page through PlaywrightHelper
             page_source = PlaywrightHelper().get_page_source(
                 url=searchurl,
                 cookies=self.cookie,
@@ -223,6 +225,7 @@ class TorrentSpider:
                 proxy=self.proxies
             )
         else:
+            # Plain HTTP request through RequestUtils
             ret = RequestUtils(
                 ua=self.ua,
                 cookies=self.cookie,
@@ -230,8 +233,20 @@ class TorrentSpider:
                 referer=self.referer,
                 proxies=self.proxies
             ).get_res(searchurl, allow_redirects=True)
-
-            page_source = ret.text if ret else None
+
+            # get_res() returns None when the request fails, so guard before reading the body
+            raw_data = ret.content if ret is not None else None
+            if raw_data:
+                # Detect the encoding with chardet; it reports None when it cannot tell
+                encoding = chardet.detect(raw_data).get("encoding") or "utf-8"
+                try:
+                    # Decode to a string
+                    page_source = raw_data.decode(encoding)
+                except (UnicodeDecodeError, LookupError):
+                    # Wrong or unknown codec: keep the page instead of aborting the search
+                    page_source = raw_data.decode("utf-8", errors="replace")
+            else:
+                page_source = ""
 
         # 解析
         return self.parse(page_source)
diff --git a/app/utils/http.py b/app/utils/http.py
index 0d0db8ff..ad9b5393 100644
--- a/app/utils/http.py
+++ b/app/utils/http.py
@@ -1,8 +1,8 @@
-from typing import Union, Any
+from typing import Union, Any, Optional
 
 import requests
 import urllib3
-from requests import Session
+from requests import Session, Response
 from urllib3.exceptions import InsecureRequestWarning
 
 urllib3.disable_warnings(InsecureRequestWarning)
@@ -48,7 +48,7 @@ class RequestUtils:
         if timeout:
             self._timeout = timeout
 
-    def post(self, url: str, data: Any = None, json: dict = None):
+    def post(self, url: str, data: Any = None, json: dict = None) -> Optional[Response]:
         if json is None:
             json = {}
         try:
@@ -71,7 +71,7 @@ class RequestUtils:
         except requests.exceptions.RequestException:
             return None
 
-    def get(self, url: str, params: dict = None):
+    def get(self, url: str, params: dict = None) -> Optional[str]:
         try:
             if self._session:
                 r = self._session.get(url,
@@ -91,7 +91,8 @@ class RequestUtils:
         except requests.exceptions.RequestException:
             return None
 
-    def get_res(self, url: str, params: dict = None, allow_redirects: bool = True, raise_exception: bool = False):
+    def get_res(self, url: str, params: dict = None,
+                allow_redirects: bool = True, raise_exception: bool = False) -> Optional[Response]:
         try:
             if self._session:
                 return self._session.get(url,
@@ -116,9 +117,10 @@ class RequestUtils:
                 raise requests.exceptions.RequestException
             return None
 
-    def post_res(self, url: str, data: Any = None, params: dict = None, allow_redirects: bool = True,
+    def post_res(self, url: str, data: Any = None, params: dict = None,
+                 allow_redirects: bool = True,
                  files: Any = None,
-                 json: dict = None):
+                 json: dict = None) -> Optional[Response]:
         try:
             if self._session:
                 return self._session.post(url,
@@ -148,7 +150,7 @@ class RequestUtils:
             return None
 
     @staticmethod
-    def cookie_parse(cookies_str: str, array: bool = False):
+    def cookie_parse(cookies_str: str, array: bool = False) -> Union[dict, list]:
         """
         解析cookie,转化为字典或者数组
         :param cookies_str: cookie字符串
diff --git a/requirements.txt b/requirements.txt
index 8aa63050..cd955986 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -38,3 +38,4 @@ cf_clearance~=0.29.2
 torrentool~=1.2.0
 slack_bolt~=1.18.0
 slack_sdk~=3.21.3
+chardet~=4.0.0
\ No newline at end of file