From 08aef1f47f5366e84cc2bf677fb9884a999dc79e Mon Sep 17 00:00:00 2001 From: jxxghp Date: Mon, 11 Sep 2023 17:13:26 +0800 Subject: [PATCH] fix rsslink helper --- app/chain/cookiecloud.py | 242 +++-------------------------------- app/helper/rss.py | 264 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 281 insertions(+), 225 deletions(-) diff --git a/app/chain/cookiecloud.py b/app/chain/cookiecloud.py index e06d5eee..fee6d627 100644 --- a/app/chain/cookiecloud.py +++ b/app/chain/cookiecloud.py @@ -10,10 +10,10 @@ from app.chain.site import SiteChain from app.core.config import settings from app.db.site_oper import SiteOper from app.db.siteicon_oper import SiteIconOper -from app.helper.browser import PlaywrightHelper from app.helper.cloudflare import under_challenge from app.helper.cookiecloud import CookieCloudHelper from app.helper.message import MessageHelper +from app.helper.rss import RssHelper from app.helper.sites import SitesHelper from app.log import logger from app.schemas import Notification, NotificationType, MessageChannel @@ -31,6 +31,7 @@ class CookieCloudChain(ChainBase): self.siteoper = SiteOper(self._db) self.siteiconoper = SiteIconOper(self._db) self.siteshelper = SitesHelper() + self.rsshelper = RssHelper() self.sitechain = SiteChain(self._db) self.message = MessageHelper() self.cookiecloud = CookieCloudHelper( @@ -82,10 +83,17 @@ class CookieCloudChain(ChainBase): logger.info(f"站点【{site_info.name}】连通性正常,不同步CookieCloud数据") if not site_info.public and not site_info.rss: # 自动生成rss地址 - rss_url = self.__get_rss(url=site_info.url, cookie=cookie, ua=settings.USER_AGENT, - proxy=site_info.proxy) - # 更新站点rss地址 - self.siteoper.update_rss(domain=domain, rss=rss_url) + rss_url, errmsg = self.rsshelper.get_rss_link( + url=site_info.url, + cookie=cookie, + ua=settings.USER_AGENT, + proxy=True if site_info.proxy else False + ) + if rss_url: + # 更新站点rss地址 + self.siteoper.update_rss(domain=domain, rss=rss_url) + else: + logger.warn(errmsg) continue # 更新站点Cookie self.siteoper.update_cookie(domain=domain, cookies=cookie) @@ -115,7 +123,12 @@ class CookieCloudChain(ChainBase): rss_url = None if not indexer.get("public") and indexer.get("domain"): # 自动生成rss地址 - rss_url = self.__get_rss(url=indexer.get("domain"), cookie=cookie, ua=settings.USER_AGENT) + rss_url, errmsg = self.rsshelper.get_rss_link(url=indexer.get("domain"), + cookie=cookie, + ua=settings.USER_AGENT) + if errmsg: + logger.warn(errmsg) + # 插入数据库 self.siteoper.add(name=indexer.get("name"), url=indexer.get("domain"), domain=domain, @@ -148,223 +161,6 @@ class CookieCloudChain(ChainBase): logger.info(f"CookieCloud同步成功:{ret_msg}") return True, ret_msg - def __get_rss(self, url: str, cookie: str, ua: str, proxy: int) -> str: - """ - 获取站点rss地址 - """ - if "ourbits.club" in url: - return self.__get_rss_ourbits(url=url, cookie=cookie, ua=ua, proxy=proxy) - if "totheglory.im" in url: - return self.__get_rss_ttg(url=url, cookie=cookie, ua=ua, proxy=proxy) - if "monikadesign.uk" in url: - return self.__get_rss_monika(url=url, cookie=cookie, ua=ua, proxy=proxy) - if "zhuque.in" in url: - return self.__get_rss_zhuque(url=url, cookie=cookie, ua=ua, proxy=proxy) - - xpath = "//a[@class='faqlink']/@href" - if "club.hares.top" in url: - xpath = "//*[@id='layui-layer100001']/div[2]/div/p[4]/a/@href" - if "et8.org" in url: - xpath = "//*[@id='outer']/table/tbody/tr/td/table/tbody/tr/td/a[2]/@href" - if "pttime.org" in url: - xpath = "//*[@id='outer']/table/tbody/tr/td/table/tbody/tr/td/text()[5]" - - return self.__get_rss_base(url=url, cookie=cookie, ua=ua, xpath=xpath, proxy=proxy) - - def __get_rss_base(self, url: str, cookie: str, ua: str, xpath: str, proxy: int) -> str: - """ - 默认获取站点rss地址 - """ - try: - get_rss_url = urljoin(url, "getrss.php") - rss_data = self.__get_rss_data(url) - res = RequestUtils(cookies=cookie, - timeout=60, - ua=ua, - proxies=settings.PROXY if proxy else None).post_res( - url=get_rss_url, data=rss_data) - if res: - html_text = res.text - else: - logger.error(f"获取rss失败:{url}") - return "" - html = etree.HTML(html_text) - if html: - rss_link = html.xpath(xpath) - if rss_link: - return str(rss_link[-1]) - return "" - except Exception as e: - print(str(e)) - return "" - - def __get_rss_ttg(self, url: str, cookie: str, ua: str, proxy: int) -> str: - """ - 获取ttg rss地址 - """ - try: - get_rss_url = urljoin(url, - "rsstools.php?c51=51&c52=52&c53=53&c54=54&c108=108&c109=109&c62=62&c63=63&c67=67&c69=69&c70=70&c73=73&c76=76&c75=75&c74=74&c87=87&c88=88&c99=99&c90=90&c58=58&c103=103&c101=101&c60=60") - res = RequestUtils(cookies=cookie, - timeout=60, - ua=ua, - proxies=settings.PROXY if proxy else None).get_res(url=get_rss_url) - if res: - html_text = res.text - else: - logger.error(f"获取rss失败:{url}") - return "" - html = etree.HTML(html_text) - if html: - rss_link = html.xpath("//textarea/text()") - if rss_link: - return str(rss_link[-1]) - return "" - except Exception as e: - print(str(e)) - return "" - - def __get_rss_monika(self, url: str, cookie: str, ua: str, proxy: int) -> str: - """ - 获取monikadesign rss地址 - """ - try: - get_rss_url = urljoin(url, "rss") - res = RequestUtils(cookies=cookie, - timeout=60, - ua=ua, - proxies=settings.PROXY if proxy else None).get_res(url=get_rss_url) - if res: - html_text = res.text - else: - logger.error(f"获取rss失败:{url}") - return "" - html = etree.HTML(html_text) - if html: - rss_link = html.xpath("//a/@href") - if rss_link: - return str(rss_link[0]) - return "" - except Exception as e: - print(str(e)) - return "" - - def __get_rss_ourbits(self, url: str, cookie: str, ua: str, proxy: int) -> str: - """ - 获取我堡rss地址 - """ - try: - get_rss_url = urljoin(url, "getrss.php") - html_text = PlaywrightHelper().get_page_source(url=get_rss_url, - cookies=cookie, - ua=ua, - proxies=settings.PROXY if proxy else None) - if html_text: - html = etree.HTML(html_text) - if html: - rss_link = html.xpath("//a[@class='gen_rsslink']/@href") - if rss_link: - return str(rss_link[-1]) - return "" - except Exception as e: - print(str(e)) - return "" - - def __get_rss_zhuque(self, url: str, cookie: str, ua: str, proxy: int) -> str: - """ - 获取zhuque rss地址 - """ - try: - get_rss_url = urljoin(url, "user/rss") - html_text = PlaywrightHelper().get_page_source(url=get_rss_url, - cookies=cookie, - ua=ua, - proxies=settings.PROXY if proxy else None) - if html_text: - html = etree.HTML(html_text) - if html: - rss_link = html.xpath("//a/@href") - if rss_link: - return str(rss_link[-1]) - return "" - except Exception as e: - print(str(e)) - return "" - - @staticmethod - def __get_rss_data(url: str) -> dict: - """ - 获取请求rss的参数,有的站不太一样,后续不断维护 - """ - _rss_data = { - "inclbookmarked": 0, - "itemsmalldescr": 1, - "showrows": 50, - "search_mode": 1, - } - - if 'hdchina.org' in url: - # 显示下载框 0全部 1仅下载框 - _rss_data['rsscart'] = 0 - - if 'audiences.me' in url: - # 种子类型 1新种与重置顶旧种 0只包含新种 - _rss_data['torrent_type'] = 1 - # RSS链接有效期: 180天 - _rss_data['exp'] = 180 - - if 'shadowflow.org' in url: - # 下载需扣除魔力 0不需要 1需要 2全部 - _rss_data['paid'] = 0 - _rss_data['search_mode'] = 0 - _rss_data['showrows'] = 30 - - if 'hddolby.com' in url: - # RSS链接有效期: 180天 - _rss_data['exp'] = 180 - - if 'hdhome.org' in url: - # RSS链接有效期: 180天 - _rss_data['exp'] = 180 - - if 'pthome.net' in url: - # RSS链接有效期: 180天 - _rss_data['exp'] = 180 - - if 'ptsbao.club' in url: - _rss_data['size'] = 0 - - if 'leaves.red' in url: - # 下载需扣除魔力 0不需要 1需要 2全部 - _rss_data['paid'] = 2 - _rss_data['search_mode'] = 0 - - if 'hdtime.org' in url: - _rss_data['search_mode'] = 0 - - if 'kp.m-team.cc' in url: - _rss_data = { - "showrows": 50, - "inclbookmarked": 0, - "itemsmalldescr": 1, - "https": 1 - } - - if 'u2.dmhy.org' in url: - # 显示自动通过的种子 0不显示自动通过的种子 1全部 - _rss_data['inclautochecked'] = 1 - # Tracker SSL 0不使用SSL 1使用SSL - _rss_data['trackerssl'] = 1 - - if 'www.pttime.org' in url: - _rss_data = { - "showrows": 10, - "inclbookmarked": 0, - "itemsmalldescr": 1 - } - - return _rss_data - @staticmethod def __parse_favicon(url: str, cookie: str, ua: str) -> Tuple[str, Optional[str]]: """ diff --git a/app/helper/rss.py b/app/helper/rss.py index ef53d59e..7285edcb 100644 --- a/app/helper/rss.py +++ b/app/helper/rss.py @@ -1,14 +1,225 @@ import xml.dom.minidom -from typing import List +from typing import List, Tuple +from urllib.parse import urljoin + +from lxml import etree from app.core.config import settings +from app.helper.browser import PlaywrightHelper from app.utils.dom import DomUtils from app.utils.http import RequestUtils from app.utils.string import StringUtils class RssHelper: - + """ + RSS帮助类,解析RSS报文、获取RSS地址等 + """ + # 各站点RSS链接获取配置 + rss_link_conf = { + "default": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + } + }, + "hares.top": { + "xpath": "//*[@id='layui-layer100001']/div[2]/div/p[4]/a/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + } + }, + "et8.org": { + "xpath": "//*[@id='outer']/table/tbody/tr/td/table/tbody/tr/td/a[2]/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + } + }, + "pttime.org": { + "xpath": "//*[@id='outer']/table/tbody/tr/td/table/tbody/tr/td/text()[5]", + "url": "getrss.php", + "params": { + "showrows": 10, + "inclbookmarked": 0, + "itemsmalldescr": 1 + } + }, + "ourbits.club": { + "xpath": "//a[@class='gen_rsslink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + } + }, + "totheglory.im": { + "xpath": "//textarea/text()", + "url": "rsstools.php?c51=51&c52=52&c53=53&c54=54&c108=108&c109=109&c62=62&c63=63&c67=67&c69=69&c70=70&c73=73&c76=76&c75=75&c74=74&c87=87&c88=88&c99=99&c90=90&c58=58&c103=103&c101=101&c60=60", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + } + }, + "monikadesign.uk": { + "xpath": "//a/@href", + "url": "rss", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + } + }, + "zhuque.in": { + "xpath": "//a/@href", + "url": "user/rss", + "render": True, + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + } + }, + "hdchina.org": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + "rsscart": 0 + } + }, + "audiences.me": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + "torrent_type": 1, + "exp": 180 + } + }, + "shadowflow.org": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "paid": 0, + "search_mode": 0, + "showrows": 30 + } + }, + "hddolby.com": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + "exp": 180 + } + }, + "hdhome.org": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + "exp": 180 + } + }, + "pthome.net": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + "exp": 180 + } + }, + "ptsbao.club": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + "size": 0 + } + }, + "leaves.red": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 0, + "paid": 2 + } + }, + "hdtime.org": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 0, + } + }, + "m-team.io": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "showrows": 50, + "inclbookmarked": 0, + "itemsmalldescr": 1, + "https": 1 + } + }, + "u2.dmhy.org": { + "xpath": "//a[@class='faqlink']/@href", + "url": "getrss.php", + "params": { + "inclbookmarked": 0, + "itemsmalldescr": 1, + "showrows": 50, + "search_mode": 1, + "inclautochecked": 1, + "trackerssl": 1 + } + }, + } + @staticmethod def parse(url, proxy: bool = False) -> List[dict]: """ @@ -78,3 +289,52 @@ class RssHelper: except Exception as e2: print(str(e2)) return ret_array + + def get_rss_link(self, url: str, cookie: str, ua: str, proxy: bool = False) -> Tuple[str, str]: + """ + 获取站点rss地址 + :param url: 站点地址 + :param cookie: 站点cookie + :param ua: 站点ua + :param proxy: 是否使用代理 + :return: rss地址、错误信息 + """ + try: + # 获取站点域名 + domain = StringUtils.get_url_domain(url) + # 获取配置 + site_conf = self.rss_link_conf.get(domain) or self.rss_link_conf.get("default") + # RSS地址 + rss_url = urljoin(url, site_conf.get("url")) + # RSS请求参数 + rss_params = site_conf.get("params") + # 请求RSS页面 + if site_conf.get("render"): + html_text = PlaywrightHelper().get_page_source( + url=rss_url, + cookies=cookie, + ua=ua, + proxies=settings.PROXY if proxy else None + ) + else: + res = RequestUtils( + cookies=cookie, + timeout=60, + ua=ua, + proxies=settings.PROXY if proxy else None + ).post_res(url=rss_url, data=rss_params) + if res: + html_text = res.text + elif res is not None: + return "", f"获取 {url} RSS链接失败,错误码:{res.status_code},错误原因:{res.reason}" + else: + return "", f"获取RSS链接失败:无法连接 {url} " + # 解析HTML + html = etree.HTML(html_text) + if html: + rss_link = html.xpath(site_conf.get("xpath")) + if rss_link: + return str(rss_link[-1]), "" + return "", f"获取RSS链接失败:{url}" + except Exception as e: + return "", f"获取 {url} RSS链接失败:{str(e)}"