From 71ceb4c186990ab2f9eed436606cca0c1bc3128c Mon Sep 17 00:00:00 2001 From: jxxghp Date: Fri, 9 Jun 2023 17:20:18 +0800 Subject: [PATCH] add playwright && remove feapder --- app/modules/indexer/__init__.py | 29 ++--- app/modules/indexer/playwright_utils.py | 42 +++++++ app/modules/indexer/spider.py | 147 +++++++++++------------- requirements.txt | 4 +- 4 files changed, 115 insertions(+), 107 deletions(-) create mode 100644 app/modules/indexer/playwright_utils.py diff --git a/app/modules/indexer/__init__.py b/app/modules/indexer/__init__.py index 567afb92..066f703b 100644 --- a/app/modules/indexer/__init__.py +++ b/app/modules/indexer/__init__.py @@ -1,4 +1,3 @@ -import time from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime from typing import List, Optional, Tuple, Union @@ -123,7 +122,7 @@ class IndexerModule(_ModuleBase): def __spider_search(indexer: CommentedMap, keyword: str = None, mtype: MediaType = None, - page: int = None, timeout: int = 30) -> (bool, List[dict]): + page: int = None) -> (bool, List[dict]): """ 根据关键字搜索单个站点 :param: indexer: 站点配置 @@ -133,26 +132,12 @@ class IndexerModule(_ModuleBase): :param: timeout: 超时时间 :return: 是否发生错误, 种子列表 """ - _spider = TorrentSpider() - _spider.setparam(indexer=indexer, - mtype=mtype, - keyword=keyword, - page=page) - _spider.start() - # 循环判断是否获取到数据 - sleep_count = 0 - while not _spider.is_complete: - sleep_count += 1 - time.sleep(1) - if sleep_count > timeout: - break - # 是否发生错误 - result_flag = _spider.is_error - # 种子列表 - result_array = _spider.torrents_info_array.copy() - # 重置状态 - _spider.torrents_info_array.clear() - return result_flag, result_array + _spider = TorrentSpider(indexer=indexer, + mtype=mtype, + keyword=keyword, + page=page) + + return _spider.is_error, _spider.get_torrents() def refresh_torrents(self, sites: List[CommentedMap]) -> Optional[List[TorrentInfo]]: """ diff --git a/app/modules/indexer/playwright_utils.py b/app/modules/indexer/playwright_utils.py new file mode 100644 index 00000000..7abfbdb8 --- /dev/null +++ b/app/modules/indexer/playwright_utils.py @@ -0,0 +1,42 @@ +from playwright.sync_api import sync_playwright + + +class PlaywrightUtils: + def __init__(self, browser_type="chromium"): + self.browser_type = browser_type + + def get_page_source(self, url: str, + cookie: str = None, + ua: str = None, + proxy: dict = None, + headless: bool = True): + """ + 获取网页源码 + :param url: 网页地址 + :param cookie: cookie + :param ua: user-agent + :param proxy: 代理 + :param headless: 是否无头模式 + """ + with sync_playwright() as playwright: + browser = playwright[self.browser_type].launch(headless=headless) + context = browser.new_context(user_agent=ua, proxy=proxy) + page = context.new_page() + if cookie: + page.set_extra_http_headers({"cookie": cookie}) + page.goto(url) + page.wait_for_load_state("networkidle") + source = page.content() + browser.close() + + return source + + +# 示例用法 +if __name__ == "__main__": + utils = PlaywrightUtils() + test_url = "https://www.baidu.com" + test_cookies = "cookie1=value1; cookie2=value2" + test_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36" + source_code = utils.get_page_source(test_url, cookie=test_cookies, ua=test_user_agent) + print(source_code) diff --git a/app/modules/indexer/spider.py b/app/modules/indexer/spider.py index 4d913a5b..ea4c0fc9 100644 --- a/app/modules/indexer/spider.py +++ b/app/modules/indexer/spider.py @@ -1,44 +1,22 @@ import copy import datetime import re -from urllib.parse import quote +from typing import List +from urllib.parse import quote, urlencode -import feapder -from feapder.utils.tools import urlencode from jinja2 import Template from pyquery import PyQuery from ruamel.yaml import CommentedMap from app.core import settings from app.log import logger +from app.modules.indexer.playwright_utils import PlaywrightUtils from app.utils.http import RequestUtils from app.utils.string import StringUtils from app.utils.types import MediaType -class TorrentSpider(feapder.AirSpider): - __custom_setting__ = dict( - SPIDER_THREAD_COUNT=1, - SPIDER_MAX_RETRY_TIMES=0, - REQUEST_LOST_TIMEOUT=10, - RETRY_FAILED_REQUESTS=False, - LOG_LEVEL="ERROR", - RANDOM_HEADERS=False, - WEBDRIVER=dict( - pool_size=1, - load_images=False, - proxy=None, - headless=True, - driver_type="CHROME", - timeout=20, - window_size=(1024, 800), - executable_path=None, - render_time=10, - custom_argument=["--ignore-certificate-errors"], - ) - ) - # 是否搜索完成标志 - is_complete: bool = False +class TorrentSpider: # 是否出现错误 is_error: bool = False # 索引器ID @@ -52,7 +30,7 @@ class TorrentSpider(feapder.AirSpider): # 站点UA ua: str = None # 代理 - proxies: bool = None + proxies: dict = None # 是否渲染 render: bool = False # Referer @@ -82,7 +60,8 @@ class TorrentSpider(feapder.AirSpider): # 种子列表 torrents_info_array: list = [] - def setparam(self, indexer: CommentedMap, + def __init__(self, + indexer: CommentedMap, keyword: [str, list] = None, page=None, referer=None, @@ -124,14 +103,12 @@ class TorrentSpider(feapder.AirSpider): self.referer = referer self.torrents_info_array = [] - def start_requests(self): + def get_torrents(self) -> List[dict]: """ 开始请求 """ - if not self.search or not self.domain: - self.is_complete = True - return + return [] # 种子搜索相对路径 paths = self.search.get('paths', []) @@ -236,20 +213,27 @@ class TorrentSpider(feapder.AirSpider): searchurl = self.domain + str(torrentspath).format(**inputs_dict) logger.info(f"开始请求:{searchurl}") - yield feapder.Request(url=searchurl, - use_session=True, - render=self.render) - def download_midware(self, request): - request.headers = { - "User-Agent": self.ua - } - request.cookies = RequestUtils.cookie_parse(self.cookie) - if self.proxies: - request.proxies = self.proxies - return request + if self.render: + page_source = PlaywrightUtils().get_page_source( + url=searchurl, + cookie=self.cookie, + ua=self.ua, + proxy=self.proxies + ) + else: + page_source = RequestUtils( + ua=self.ua, + cookies=self.cookie, + timeout=30, + referer=self.referer, + proxies=self.proxies + ).get_res(searchurl, allow_redirects=True) - def Gettitle_default(self, torrent): + # 解析 + return self.parse(page_source) + + def __get_title(self, torrent): # title default if 'title' not in self.fields: return @@ -279,7 +263,7 @@ class TorrentSpider(feapder.AirSpider): self.torrents_info['title'] = self.__filter_text(self.torrents_info.get('title'), selector.get('filters')) - def Gettitle_optional(self, torrent): + def __get_description(self, torrent): # title optional if 'description' not in self.fields: return @@ -325,7 +309,7 @@ class TorrentSpider(feapder.AirSpider): self.torrents_info['description'] = self.__filter_text(self.torrents_info.get('description'), selector.get('filters')) - def Getdetails(self, torrent): + def __get_detail(self, torrent): # details if 'details' not in self.fields: return @@ -346,7 +330,7 @@ class TorrentSpider(feapder.AirSpider): else: self.torrents_info['page_url'] = detail_link - def Getdownload(self, torrent): + def __get_download(self, torrent): # download link if 'download' not in self.fields: return @@ -363,7 +347,7 @@ class TorrentSpider(feapder.AirSpider): else: self.torrents_info['enclosure'] = download_link - def Getimdbid(self, torrent): + def __get_imdbid(self, torrent): # imdbid if "imdbid" not in self.fields: return @@ -376,7 +360,7 @@ class TorrentSpider(feapder.AirSpider): self.torrents_info['imdbid'] = self.__filter_text(self.torrents_info.get('imdbid'), selector.get('filters')) - def Getsize(self, torrent): + def __get_size(self, torrent): # torrent size if 'size' not in self.fields: return @@ -391,7 +375,7 @@ class TorrentSpider(feapder.AirSpider): selector.get('filters')) self.torrents_info['size'] = StringUtils.num_filesize(self.torrents_info.get('size')) - def Getleechers(self, torrent): + def __get_leechers(self, torrent): # torrent leechers if 'leechers' not in self.fields: return @@ -407,7 +391,7 @@ class TorrentSpider(feapder.AirSpider): else: self.torrents_info['peers'] = 0 - def Getseeders(self, torrent): + def __get_seeders(self, torrent): # torrent leechers if 'seeders' not in self.fields: return @@ -423,7 +407,7 @@ class TorrentSpider(feapder.AirSpider): else: self.torrents_info['seeders'] = 0 - def Getgrabs(self, torrent): + def __get_grabs(self, torrent): # torrent grabs if 'grabs' not in self.fields: return @@ -439,7 +423,7 @@ class TorrentSpider(feapder.AirSpider): else: self.torrents_info['grabs'] = 0 - def Getpubdate(self, torrent): + def __get_pubdate(self, torrent): # torrent pubdate if 'date_added' not in self.fields: return @@ -451,7 +435,7 @@ class TorrentSpider(feapder.AirSpider): self.torrents_info['pubdate'] = self.__filter_text(self.torrents_info.get('pubdate'), selector.get('filters')) - def Getelapsed_date(self, torrent): + def __get_date_elapsed(self, torrent): # torrent pubdate if 'date_elapsed' not in self.fields: return @@ -463,7 +447,7 @@ class TorrentSpider(feapder.AirSpider): self.torrents_info['date_elapsed'] = self.__filter_text(self.torrents_info.get('date_elapsed'), selector.get('filters')) - def Getdownloadvolumefactor(self, torrent): + def __get_downloadvolumefactor(self, torrent): # downloadvolumefactor selector = self.fields.get('downloadvolumefactor', {}) if not selector: @@ -486,7 +470,7 @@ class TorrentSpider(feapder.AirSpider): if downloadvolumefactor: self.torrents_info['downloadvolumefactor'] = int(downloadvolumefactor.group(1)) - def Getuploadvolumefactor(self, torrent): + def __get_uploadvolumefactor(self, torrent): # uploadvolumefactor selector = self.fields.get('uploadvolumefactor', {}) if not selector: @@ -509,7 +493,7 @@ class TorrentSpider(feapder.AirSpider): if uploadvolumefactor: self.torrents_info['uploadvolumefactor'] = int(uploadvolumefactor.group(1)) - def Getlabels(self, torrent): + def __get_labels(self, torrent): # labels if 'labels' not in self.fields: return @@ -520,26 +504,26 @@ class TorrentSpider(feapder.AirSpider): if items: self.torrents_info['labels'] = items - def Getinfo(self, torrent): + def get_info(self, torrent) -> dict: """ 解析单条种子数据 """ self.torrents_info = {'indexer': self.indexerid} try: - self.Gettitle_default(torrent) - self.Gettitle_optional(torrent) - self.Getdetails(torrent) - self.Getdownload(torrent) - self.Getgrabs(torrent) - self.Getleechers(torrent) - self.Getseeders(torrent) - self.Getsize(torrent) - self.Getimdbid(torrent) - self.Getdownloadvolumefactor(torrent) - self.Getuploadvolumefactor(torrent) - self.Getpubdate(torrent) - self.Getelapsed_date(torrent) - self.Getlabels(torrent) + self.__get_title(torrent) + self.__get_description(torrent) + self.__get_detail(torrent) + self.__get_download(torrent) + self.__get_grabs(torrent) + self.__get_leechers(torrent) + self.__get_seeders(torrent) + self.__get_size(torrent) + self.__get_imdbid(torrent) + self.__get_downloadvolumefactor(torrent) + self.__get_uploadvolumefactor(torrent) + self.__get_pubdate(torrent) + self.__get_date_elapsed(torrent) + self.__get_labels(torrent) except Exception as err: logger.error("%s 搜索出现错误:%s" % (self.indexername, str(err))) return self.torrents_info @@ -613,29 +597,26 @@ class TorrentSpider(feapder.AirSpider): items = items[0] return items - def parse(self, request, response): + def parse(self, html_text: str) -> List[dict]: """ 解析整个页面 """ + if not html_text: + self.is_error = True + return [] + # 清空旧结果 + self.torrents_info_array = [] try: - # 获取站点文本 - html_text = response.extract() - if not html_text: - self.is_error = True - self.is_complete = True - return # 解析站点文本对象 html_doc = PyQuery(html_text) # 种子筛选器 torrents_selector = self.list.get('selector', '') # 遍历种子html列表 for torn in html_doc(torrents_selector): - self.torrents_info_array.append(copy.deepcopy(self.Getinfo(PyQuery(torn)))) + self.torrents_info_array.append(copy.deepcopy(self.get_info(PyQuery(torn)))) if len(self.torrents_info_array) >= int(self.result_num): break - + return self.torrents_info_array except Exception as err: self.is_error = True logger.warn(f"错误:{self.indexername} {err}") - finally: - self.is_complete = True diff --git a/requirements.txt b/requirements.txt index 3389d3b9..b246ef1e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,7 +28,6 @@ pycryptodome~=3.18.0 qbittorrent-api==2023.5.48 plexapi~=4.14.0 transmission-rpc~=4.3.0 -feapder~=1.8.5 Jinja2~=3.1.2 pyparsing~=3.0.9 func_timeout==4.3.5 @@ -36,4 +35,5 @@ selenium~=4.9.1 bs4~=0.0.1 beautifulsoup4~=4.12.2 pillow~=9.5.0 -pyTelegramBotAPI~=4.12.0 \ No newline at end of file +pyTelegramBotAPI~=4.12.0 +playwright~=1.34.0 \ No newline at end of file