add playwright && remove feapder

This commit is contained in:
jxxghp 2023-06-09 17:20:18 +08:00
parent a95b5f5c8e
commit 71ceb4c186
4 changed files with 115 additions and 107 deletions

View File

@ -1,4 +1,3 @@
import time
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime from datetime import datetime
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
@ -123,7 +122,7 @@ class IndexerModule(_ModuleBase):
def __spider_search(indexer: CommentedMap, def __spider_search(indexer: CommentedMap,
keyword: str = None, keyword: str = None,
mtype: MediaType = None, mtype: MediaType = None,
page: int = None, timeout: int = 30) -> (bool, List[dict]): page: int = None) -> (bool, List[dict]):
""" """
根据关键字搜索单个站点 根据关键字搜索单个站点
:param: indexer: 站点配置 :param: indexer: 站点配置
@ -133,26 +132,12 @@ class IndexerModule(_ModuleBase):
:param: timeout: 超时时间 :param: timeout: 超时时间
:return: 是否发生错误, 种子列表 :return: 是否发生错误, 种子列表
""" """
_spider = TorrentSpider() _spider = TorrentSpider(indexer=indexer,
_spider.setparam(indexer=indexer, mtype=mtype,
mtype=mtype, keyword=keyword,
keyword=keyword, page=page)
page=page)
_spider.start() return _spider.is_error, _spider.get_torrents()
# 循环判断是否获取到数据
sleep_count = 0
while not _spider.is_complete:
sleep_count += 1
time.sleep(1)
if sleep_count > timeout:
break
# 是否发生错误
result_flag = _spider.is_error
# 种子列表
result_array = _spider.torrents_info_array.copy()
# 重置状态
_spider.torrents_info_array.clear()
return result_flag, result_array
def refresh_torrents(self, sites: List[CommentedMap]) -> Optional[List[TorrentInfo]]: def refresh_torrents(self, sites: List[CommentedMap]) -> Optional[List[TorrentInfo]]:
""" """

View File

@ -0,0 +1,42 @@
from playwright.sync_api import sync_playwright
class PlaywrightUtils:
    """
    Thin wrapper around Playwright's sync API for fetching the fully
    rendered HTML of a page (used for sites that require JS rendering).
    """

    def __init__(self, browser_type="chromium"):
        # Browser engine key used to index the playwright object:
        # "chromium", "firefox" or "webkit".
        self.browser_type = browser_type

    def get_page_source(self, url: str,
                        cookie: str = None,
                        ua: str = None,
                        proxy: dict = None,
                        headless: bool = True):
        """
        获取网页源码 (fetch the rendered page source).
        :param url: 网页地址 (page URL)
        :param cookie: cookie — raw Cookie header value, sent via extra HTTP headers
        :param ua: user-agent string for the browser context
        :param proxy: 代理 — Playwright proxy settings dict (e.g. {"server": "..."}),
                      passed straight to new_context; TODO confirm callers build this shape
        :param headless: 是否无头模式 (run headless)
        :return: rendered HTML string, or None if navigation failed
        """
        source = None
        with sync_playwright() as playwright:
            browser = playwright[self.browser_type].launch(headless=headless)
            # try/finally so a navigation/timeout error cannot leak the
            # headless browser process (the original only closed on success).
            try:
                context = browser.new_context(user_agent=ua, proxy=proxy)
                page = context.new_page()
                if cookie:
                    # Forward the raw cookie string as a request header rather
                    # than parsing it into context.add_cookies entries.
                    page.set_extra_http_headers({"cookie": cookie})
                page.goto(url)
                # Wait until network is idle so JS-rendered content is present.
                page.wait_for_load_state("networkidle")
                source = page.content()
            finally:
                browser.close()
        return source
# Example usage: fetch a page with a cookie header and custom UA, then dump it.
if __name__ == "__main__":
    helper = PlaywrightUtils()
    demo_url = "https://www.baidu.com"
    demo_cookie = "cookie1=value1; cookie2=value2"
    demo_ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
    page_html = helper.get_page_source(
        demo_url,
        cookie=demo_cookie,
        ua=demo_ua,
    )
    print(page_html)

View File

@ -1,44 +1,22 @@
import copy import copy
import datetime import datetime
import re import re
from urllib.parse import quote from typing import List
from urllib.parse import quote, urlencode
import feapder
from feapder.utils.tools import urlencode
from jinja2 import Template from jinja2 import Template
from pyquery import PyQuery from pyquery import PyQuery
from ruamel.yaml import CommentedMap from ruamel.yaml import CommentedMap
from app.core import settings from app.core import settings
from app.log import logger from app.log import logger
from app.modules.indexer.playwright_utils import PlaywrightUtils
from app.utils.http import RequestUtils from app.utils.http import RequestUtils
from app.utils.string import StringUtils from app.utils.string import StringUtils
from app.utils.types import MediaType from app.utils.types import MediaType
class TorrentSpider(feapder.AirSpider): class TorrentSpider:
__custom_setting__ = dict(
SPIDER_THREAD_COUNT=1,
SPIDER_MAX_RETRY_TIMES=0,
REQUEST_LOST_TIMEOUT=10,
RETRY_FAILED_REQUESTS=False,
LOG_LEVEL="ERROR",
RANDOM_HEADERS=False,
WEBDRIVER=dict(
pool_size=1,
load_images=False,
proxy=None,
headless=True,
driver_type="CHROME",
timeout=20,
window_size=(1024, 800),
executable_path=None,
render_time=10,
custom_argument=["--ignore-certificate-errors"],
)
)
# 是否搜索完成标志
is_complete: bool = False
# 是否出现错误 # 是否出现错误
is_error: bool = False is_error: bool = False
# 索引器ID # 索引器ID
@ -52,7 +30,7 @@ class TorrentSpider(feapder.AirSpider):
# 站点UA # 站点UA
ua: str = None ua: str = None
# 代理 # 代理
proxies: bool = None proxies: dict = None
# 是否渲染 # 是否渲染
render: bool = False render: bool = False
# Referer # Referer
@ -82,7 +60,8 @@ class TorrentSpider(feapder.AirSpider):
# 种子列表 # 种子列表
torrents_info_array: list = [] torrents_info_array: list = []
def setparam(self, indexer: CommentedMap, def __init__(self,
indexer: CommentedMap,
keyword: [str, list] = None, keyword: [str, list] = None,
page=None, page=None,
referer=None, referer=None,
@ -124,14 +103,12 @@ class TorrentSpider(feapder.AirSpider):
self.referer = referer self.referer = referer
self.torrents_info_array = [] self.torrents_info_array = []
def start_requests(self): def get_torrents(self) -> List[dict]:
""" """
开始请求 开始请求
""" """
if not self.search or not self.domain: if not self.search or not self.domain:
self.is_complete = True return []
return
# 种子搜索相对路径 # 种子搜索相对路径
paths = self.search.get('paths', []) paths = self.search.get('paths', [])
@ -236,20 +213,27 @@ class TorrentSpider(feapder.AirSpider):
searchurl = self.domain + str(torrentspath).format(**inputs_dict) searchurl = self.domain + str(torrentspath).format(**inputs_dict)
logger.info(f"开始请求:{searchurl}") logger.info(f"开始请求:{searchurl}")
yield feapder.Request(url=searchurl,
use_session=True,
render=self.render)
def download_midware(self, request): if self.render:
request.headers = { page_source = PlaywrightUtils().get_page_source(
"User-Agent": self.ua url=searchurl,
} cookie=self.cookie,
request.cookies = RequestUtils.cookie_parse(self.cookie) ua=self.ua,
if self.proxies: proxy=self.proxies
request.proxies = self.proxies )
return request else:
page_source = RequestUtils(
ua=self.ua,
cookies=self.cookie,
timeout=30,
referer=self.referer,
proxies=self.proxies
).get_res(searchurl, allow_redirects=True)
def Gettitle_default(self, torrent): # 解析
return self.parse(page_source)
def __get_title(self, torrent):
# title default # title default
if 'title' not in self.fields: if 'title' not in self.fields:
return return
@ -279,7 +263,7 @@ class TorrentSpider(feapder.AirSpider):
self.torrents_info['title'] = self.__filter_text(self.torrents_info.get('title'), self.torrents_info['title'] = self.__filter_text(self.torrents_info.get('title'),
selector.get('filters')) selector.get('filters'))
def Gettitle_optional(self, torrent): def __get_description(self, torrent):
# title optional # title optional
if 'description' not in self.fields: if 'description' not in self.fields:
return return
@ -325,7 +309,7 @@ class TorrentSpider(feapder.AirSpider):
self.torrents_info['description'] = self.__filter_text(self.torrents_info.get('description'), self.torrents_info['description'] = self.__filter_text(self.torrents_info.get('description'),
selector.get('filters')) selector.get('filters'))
def Getdetails(self, torrent): def __get_detail(self, torrent):
# details # details
if 'details' not in self.fields: if 'details' not in self.fields:
return return
@ -346,7 +330,7 @@ class TorrentSpider(feapder.AirSpider):
else: else:
self.torrents_info['page_url'] = detail_link self.torrents_info['page_url'] = detail_link
def Getdownload(self, torrent): def __get_download(self, torrent):
# download link # download link
if 'download' not in self.fields: if 'download' not in self.fields:
return return
@ -363,7 +347,7 @@ class TorrentSpider(feapder.AirSpider):
else: else:
self.torrents_info['enclosure'] = download_link self.torrents_info['enclosure'] = download_link
def Getimdbid(self, torrent): def __get_imdbid(self, torrent):
# imdbid # imdbid
if "imdbid" not in self.fields: if "imdbid" not in self.fields:
return return
@ -376,7 +360,7 @@ class TorrentSpider(feapder.AirSpider):
self.torrents_info['imdbid'] = self.__filter_text(self.torrents_info.get('imdbid'), self.torrents_info['imdbid'] = self.__filter_text(self.torrents_info.get('imdbid'),
selector.get('filters')) selector.get('filters'))
def Getsize(self, torrent): def __get_size(self, torrent):
# torrent size # torrent size
if 'size' not in self.fields: if 'size' not in self.fields:
return return
@ -391,7 +375,7 @@ class TorrentSpider(feapder.AirSpider):
selector.get('filters')) selector.get('filters'))
self.torrents_info['size'] = StringUtils.num_filesize(self.torrents_info.get('size')) self.torrents_info['size'] = StringUtils.num_filesize(self.torrents_info.get('size'))
def Getleechers(self, torrent): def __get_leechers(self, torrent):
# torrent leechers # torrent leechers
if 'leechers' not in self.fields: if 'leechers' not in self.fields:
return return
@ -407,7 +391,7 @@ class TorrentSpider(feapder.AirSpider):
else: else:
self.torrents_info['peers'] = 0 self.torrents_info['peers'] = 0
def Getseeders(self, torrent): def __get_seeders(self, torrent):
# torrent leechers # torrent leechers
if 'seeders' not in self.fields: if 'seeders' not in self.fields:
return return
@ -423,7 +407,7 @@ class TorrentSpider(feapder.AirSpider):
else: else:
self.torrents_info['seeders'] = 0 self.torrents_info['seeders'] = 0
def Getgrabs(self, torrent): def __get_grabs(self, torrent):
# torrent grabs # torrent grabs
if 'grabs' not in self.fields: if 'grabs' not in self.fields:
return return
@ -439,7 +423,7 @@ class TorrentSpider(feapder.AirSpider):
else: else:
self.torrents_info['grabs'] = 0 self.torrents_info['grabs'] = 0
def Getpubdate(self, torrent): def __get_pubdate(self, torrent):
# torrent pubdate # torrent pubdate
if 'date_added' not in self.fields: if 'date_added' not in self.fields:
return return
@ -451,7 +435,7 @@ class TorrentSpider(feapder.AirSpider):
self.torrents_info['pubdate'] = self.__filter_text(self.torrents_info.get('pubdate'), self.torrents_info['pubdate'] = self.__filter_text(self.torrents_info.get('pubdate'),
selector.get('filters')) selector.get('filters'))
def Getelapsed_date(self, torrent): def __get_date_elapsed(self, torrent):
# torrent pubdate # torrent pubdate
if 'date_elapsed' not in self.fields: if 'date_elapsed' not in self.fields:
return return
@ -463,7 +447,7 @@ class TorrentSpider(feapder.AirSpider):
self.torrents_info['date_elapsed'] = self.__filter_text(self.torrents_info.get('date_elapsed'), self.torrents_info['date_elapsed'] = self.__filter_text(self.torrents_info.get('date_elapsed'),
selector.get('filters')) selector.get('filters'))
def Getdownloadvolumefactor(self, torrent): def __get_downloadvolumefactor(self, torrent):
# downloadvolumefactor # downloadvolumefactor
selector = self.fields.get('downloadvolumefactor', {}) selector = self.fields.get('downloadvolumefactor', {})
if not selector: if not selector:
@ -486,7 +470,7 @@ class TorrentSpider(feapder.AirSpider):
if downloadvolumefactor: if downloadvolumefactor:
self.torrents_info['downloadvolumefactor'] = int(downloadvolumefactor.group(1)) self.torrents_info['downloadvolumefactor'] = int(downloadvolumefactor.group(1))
def Getuploadvolumefactor(self, torrent): def __get_uploadvolumefactor(self, torrent):
# uploadvolumefactor # uploadvolumefactor
selector = self.fields.get('uploadvolumefactor', {}) selector = self.fields.get('uploadvolumefactor', {})
if not selector: if not selector:
@ -509,7 +493,7 @@ class TorrentSpider(feapder.AirSpider):
if uploadvolumefactor: if uploadvolumefactor:
self.torrents_info['uploadvolumefactor'] = int(uploadvolumefactor.group(1)) self.torrents_info['uploadvolumefactor'] = int(uploadvolumefactor.group(1))
def Getlabels(self, torrent): def __get_labels(self, torrent):
# labels # labels
if 'labels' not in self.fields: if 'labels' not in self.fields:
return return
@ -520,26 +504,26 @@ class TorrentSpider(feapder.AirSpider):
if items: if items:
self.torrents_info['labels'] = items self.torrents_info['labels'] = items
def Getinfo(self, torrent): def get_info(self, torrent) -> dict:
""" """
解析单条种子数据 解析单条种子数据
""" """
self.torrents_info = {'indexer': self.indexerid} self.torrents_info = {'indexer': self.indexerid}
try: try:
self.Gettitle_default(torrent) self.__get_title(torrent)
self.Gettitle_optional(torrent) self.__get_description(torrent)
self.Getdetails(torrent) self.__get_detail(torrent)
self.Getdownload(torrent) self.__get_download(torrent)
self.Getgrabs(torrent) self.__get_grabs(torrent)
self.Getleechers(torrent) self.__get_leechers(torrent)
self.Getseeders(torrent) self.__get_seeders(torrent)
self.Getsize(torrent) self.__get_size(torrent)
self.Getimdbid(torrent) self.__get_imdbid(torrent)
self.Getdownloadvolumefactor(torrent) self.__get_downloadvolumefactor(torrent)
self.Getuploadvolumefactor(torrent) self.__get_uploadvolumefactor(torrent)
self.Getpubdate(torrent) self.__get_pubdate(torrent)
self.Getelapsed_date(torrent) self.__get_date_elapsed(torrent)
self.Getlabels(torrent) self.__get_labels(torrent)
except Exception as err: except Exception as err:
logger.error("%s 搜索出现错误:%s" % (self.indexername, str(err))) logger.error("%s 搜索出现错误:%s" % (self.indexername, str(err)))
return self.torrents_info return self.torrents_info
@ -613,29 +597,26 @@ class TorrentSpider(feapder.AirSpider):
items = items[0] items = items[0]
return items return items
def parse(self, request, response): def parse(self, html_text: str) -> List[dict]:
""" """
解析整个页面 解析整个页面
""" """
if not html_text:
self.is_error = True
return []
# 清空旧结果
self.torrents_info_array = []
try: try:
# 获取站点文本
html_text = response.extract()
if not html_text:
self.is_error = True
self.is_complete = True
return
# 解析站点文本对象 # 解析站点文本对象
html_doc = PyQuery(html_text) html_doc = PyQuery(html_text)
# 种子筛选器 # 种子筛选器
torrents_selector = self.list.get('selector', '') torrents_selector = self.list.get('selector', '')
# 遍历种子html列表 # 遍历种子html列表
for torn in html_doc(torrents_selector): for torn in html_doc(torrents_selector):
self.torrents_info_array.append(copy.deepcopy(self.Getinfo(PyQuery(torn)))) self.torrents_info_array.append(copy.deepcopy(self.get_info(PyQuery(torn))))
if len(self.torrents_info_array) >= int(self.result_num): if len(self.torrents_info_array) >= int(self.result_num):
break break
return self.torrents_info_array
except Exception as err: except Exception as err:
self.is_error = True self.is_error = True
logger.warn(f"错误:{self.indexername} {err}") logger.warn(f"错误:{self.indexername} {err}")
finally:
self.is_complete = True

View File

@ -28,7 +28,6 @@ pycryptodome~=3.18.0
qbittorrent-api==2023.5.48 qbittorrent-api==2023.5.48
plexapi~=4.14.0 plexapi~=4.14.0
transmission-rpc~=4.3.0 transmission-rpc~=4.3.0
feapder~=1.8.5
Jinja2~=3.1.2 Jinja2~=3.1.2
pyparsing~=3.0.9 pyparsing~=3.0.9
func_timeout==4.3.5 func_timeout==4.3.5
@ -36,4 +35,5 @@ selenium~=4.9.1
bs4~=0.0.1 bs4~=0.0.1
beautifulsoup4~=4.12.2 beautifulsoup4~=4.12.2
pillow~=9.5.0 pillow~=9.5.0
pyTelegramBotAPI~=4.12.0 pyTelegramBotAPI~=4.12.0
playwright~=1.34.0