fix 索引乱码问题

This commit is contained in:
jxxghp 2023-06-11 18:58:29 +08:00
parent db4b9fcef8
commit 1e82ecc16c
3 changed files with 24 additions and 10 deletions

View File

@@ -4,6 +4,7 @@ import re
from typing import List from typing import List
from urllib.parse import quote, urlencode from urllib.parse import quote, urlencode
import chardet
from jinja2 import Template from jinja2 import Template
from pyquery import PyQuery from pyquery import PyQuery
from ruamel.yaml import CommentedMap from ruamel.yaml import CommentedMap
@@ -216,6 +217,7 @@ class TorrentSpider:
logger.info(f"开始请求:{searchurl}") logger.info(f"开始请求:{searchurl}")
if self.render: if self.render:
# 浏览器仿真
page_source = PlaywrightHelper().get_page_source( page_source = PlaywrightHelper().get_page_source(
url=searchurl, url=searchurl,
cookies=self.cookie, cookies=self.cookie,
@@ -223,6 +225,7 @@ class TorrentSpider:
proxy=self.proxies proxy=self.proxies
) )
else: else:
# requests请求
ret = RequestUtils( ret = RequestUtils(
ua=self.ua, ua=self.ua,
cookies=self.cookie, cookies=self.cookie,
@@ -230,8 +233,16 @@ class TorrentSpider:
referer=self.referer, referer=self.referer,
proxies=self.proxies proxies=self.proxies
).get_res(searchurl, allow_redirects=True) ).get_res(searchurl, allow_redirects=True)
page_source = ret.text if ret else None # 使用chardet检测字符编码
raw_data = ret.content
if raw_data:
result = chardet.detect(raw_data)
encoding = result['encoding']
# 解码为字符串
page_source = raw_data.decode(encoding)
else:
page_source = ""
# 解析 # 解析
return self.parse(page_source) return self.parse(page_source)

View File

@@ -1,8 +1,8 @@
from typing import Union, Any from typing import Union, Any, Optional
import requests import requests
import urllib3 import urllib3
from requests import Session from requests import Session, Response
from urllib3.exceptions import InsecureRequestWarning from urllib3.exceptions import InsecureRequestWarning
urllib3.disable_warnings(InsecureRequestWarning) urllib3.disable_warnings(InsecureRequestWarning)
@@ -48,7 +48,7 @@ class RequestUtils:
if timeout: if timeout:
self._timeout = timeout self._timeout = timeout
def post(self, url: str, data: Any = None, json: dict = None): def post(self, url: str, data: Any = None, json: dict = None) -> Optional[Response]:
if json is None: if json is None:
json = {} json = {}
try: try:
@@ -71,7 +71,7 @@ class RequestUtils:
except requests.exceptions.RequestException: except requests.exceptions.RequestException:
return None return None
def get(self, url: str, params: dict = None): def get(self, url: str, params: dict = None) -> Optional[str]:
try: try:
if self._session: if self._session:
r = self._session.get(url, r = self._session.get(url,
@@ -91,7 +91,8 @@ class RequestUtils:
except requests.exceptions.RequestException: except requests.exceptions.RequestException:
return None return None
def get_res(self, url: str, params: dict = None, allow_redirects: bool = True, raise_exception: bool = False): def get_res(self, url: str, params: dict = None,
allow_redirects: bool = True, raise_exception: bool = False) -> Optional[Response]:
try: try:
if self._session: if self._session:
return self._session.get(url, return self._session.get(url,
@@ -116,9 +117,10 @@ class RequestUtils:
raise requests.exceptions.RequestException raise requests.exceptions.RequestException
return None return None
def post_res(self, url: str, data: Any = None, params: dict = None, allow_redirects: bool = True, def post_res(self, url: str, data: Any = None, params: dict = None,
allow_redirects: bool = True,
files: Any = None, files: Any = None,
json: dict = None): json: dict = None) -> Optional[Response]:
try: try:
if self._session: if self._session:
return self._session.post(url, return self._session.post(url,
@@ -148,7 +150,7 @@ class RequestUtils:
return None return None
@staticmethod @staticmethod
def cookie_parse(cookies_str: str, array: bool = False): def cookie_parse(cookies_str: str, array: bool = False) -> dict:
""" """
解析cookie转化为字典或者数组 解析cookie转化为字典或者数组
:param cookies_str: cookie字符串 :param cookies_str: cookie字符串

View File

@@ -38,3 +38,4 @@ cf_clearance~=0.29.2
torrentool~=1.2.0 torrentool~=1.2.0
slack_bolt~=1.18.0 slack_bolt~=1.18.0
slack_sdk~=3.21.3 slack_sdk~=3.21.3
chardet~=4.0.0