373 lines
13 KiB
Python
373 lines
13 KiB
Python
import re
|
||
import traceback
|
||
import xml.dom.minidom
|
||
from typing import List, Tuple, Union
|
||
from urllib.parse import urljoin
|
||
|
||
import chardet
|
||
from lxml import etree
|
||
|
||
from app.core.config import settings
|
||
from app.helper.browser import PlaywrightHelper
|
||
from app.log import logger
|
||
from app.utils.dom import DomUtils
|
||
from app.utils.http import RequestUtils
|
||
from app.utils.string import StringUtils
|
||
|
||
|
||
class RssHelper:
    """
    RSS helper: parses RSS feeds and discovers a site's personal RSS address.
    """

    # Per-site configuration used by get_rss_link(), keyed by site domain.
    # The "default" entry is used for any domain that has no entry of its own.
    #   xpath  - XPath evaluated on the returned page; the LAST match is the RSS link
    #   url    - path joined onto the site address to build the page to request
    #   params - form data POSTed to that page (only used when "render" is not set)
    #   render - when truthy, the page is fetched through PlaywrightHelper instead
    rss_link_conf = {
        "default": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
            }
        },
        "hares.top": {
            "xpath": "//*[@id='layui-layer100001']/div[2]/div/p[4]/a/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
            }
        },
        "et8.org": {
            "xpath": "//*[@id='outer']/table/tbody/tr/td/table/tbody/tr/td/a[2]/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
            }
        },
        "pttime.org": {
            "xpath": "//*[@id='outer']/table/tbody/tr/td/table/tbody/tr/td/text()[5]",
            "url": "getrss.php",
            "params": {
                "showrows": 10,
                "inclbookmarked": 0,
                "itemsmalldescr": 1
            }
        },
        "ourbits.club": {
            "xpath": "//a[@class='gen_rsslink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
            }
        },
        "totheglory.im": {
            "xpath": "//textarea/text()",
            "url": "rsstools.php?c51=51&c52=52&c53=53&c54=54&c108=108&c109=109&c62=62&c63=63&c67=67&c69=69&c70=70&c73=73&c76=76&c75=75&c74=74&c87=87&c88=88&c99=99&c90=90&c58=58&c103=103&c101=101&c60=60",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
            }
        },
        "monikadesign.uk": {
            "xpath": "//a/@href",
            "url": "rss",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
            }
        },
        "zhuque.in": {
            "xpath": "//a/@href",
            "url": "user/rss",
            "render": True,
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
            }
        },
        "hdchina.org": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
                "rsscart": 0
            }
        },
        "audiences.me": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
                "torrent_type": 1,
                "exp": 180
            }
        },
        "shadowflow.org": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "paid": 0,
                "search_mode": 0,
                "showrows": 30
            }
        },
        "hddolby.com": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
                "exp": 180
            }
        },
        "hdhome.org": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
                "exp": 180
            }
        },
        "pthome.net": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
                "exp": 180
            }
        },
        "ptsbao.club": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
                "size": 0
            }
        },
        "leaves.red": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 0,
                "paid": 2
            }
        },
        "hdtime.org": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 0,
            }
        },
        "m-team.io": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "showrows": 50,
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "https": 1
            }
        },
        "u2.dmhy.org": {
            "xpath": "//a[@class='faqlink']/@href",
            "url": "getrss.php",
            "params": {
                "inclbookmarked": 0,
                "itemsmalldescr": 1,
                "showrows": 50,
                "search_mode": 1,
                "inclautochecked": 1,
                "trackerssl": 1
            }
        },
    }

    # Plain-text bodies that sites return in place of a feed once the RSS link
    # has expired (e.g. audiences / pthome). Compared after whitespace trimming.
    _rss_expired_msg = (
        "RSS 链接已过期, 您需要获得一个新的!",
        "RSS Link has expired, You need to get a new one!",
        "RSS Link has expired, You need to get new!",
    )

    @staticmethod
    def _is_rss_expired(content: str) -> bool:
        """
        Tell whether a response body is one of the known "RSS link expired" notices.

        Surrounding whitespace/newlines and a leading BOM are ignored so that a
        trailing line break in the response does not hide the notice.
        """
        if not content:
            return False
        return content.lstrip("\ufeff").strip() in RssHelper._rss_expired_msg

    @staticmethod
    def _decode_response(ret) -> str:
        """
        Decode the body of an HTTP response into text.

        Order of attempts:
          1. the encoding detected by chardet on the raw bytes;
          2. the encoding named in an `encoding="..."` declaration in the body;
          3. the response's own `text` (after switching it to `apparent_encoding`
             when no declaration was found).
        """
        ret_xml = ""
        raw_data = ret.content
        if raw_data:
            try:
                result = chardet.detect(raw_data)
                # May be None when detection fails; decode() then raises and we
                # drop into the fallback below.
                ret_xml = raw_data.decode(result['encoding'])
            except Exception as e:
                logger.debug(f"chardet解码失败:{str(e)}")
                # Fall back to the encoding declared inside the document itself
                match = re.search(r'encoding\s*=\s*["\']([^"\']+)["\']', ret.text)
                if match:
                    try:
                        ret_xml = raw_data.decode(match.group(1))
                    except (LookupError, UnicodeDecodeError) as err:
                        # Unknown or wrong declared encoding: do not abort the
                        # whole parse, let ret.text be used instead.
                        logger.debug(f"按声明编码解码失败:{str(err)}")
                else:
                    ret.encoding = ret.apparent_encoding
        return ret_xml or ret.text

    @staticmethod
    def _parse_item(item) -> Union[dict, None]:
        """
        Convert one RSS <item> DOM node into a dict, or None when it must be skipped.

        An item is skipped when it has no title, or neither an enclosure URL nor a link.
        """
        # Title
        title = DomUtils.tag_value(item, "title", default="")
        if not title:
            return None
        # Description
        description = DomUtils.tag_value(item, "description", default="")
        # Torrent detail page
        link = DomUtils.tag_value(item, "link", default="")
        # Torrent download address
        enclosure = DomUtils.tag_value(item, "enclosure", "url", default="")
        if not enclosure and not link:
            return None
        # Some feeds only provide <link> and no <enclosure>
        if not enclosure:
            enclosure = link
        # Size; anything that is not a plain digit string is reported as 0
        size = DomUtils.tag_value(item, "enclosure", "length", default=0)
        size = int(size) if size and str(size).isdigit() else 0
        # Publish date, converted through StringUtils.get_time when present
        pubdate = DomUtils.tag_value(item, "pubDate", default="")
        if pubdate:
            pubdate = StringUtils.get_time(pubdate)
        return {'title': title,
                'enclosure': enclosure,
                'size': size,
                'description': description,
                'link': link,
                'pubdate': pubdate}

    @staticmethod
    def parse(url, proxy: bool = False, timeout: int = 30) -> Union[List[dict], None]:
        """
        Fetch an RSS subscription URL and extract the torrent entries it lists.

        :param url: RSS address
        :param proxy: whether to send the request through settings.PROXY
        :param timeout: request timeout passed to RequestUtils
        :return: list of dicts with keys title, enclosure, size, description,
                 link and pubdate. An empty list is returned when the URL is
                 empty, the request fails, or the body cannot be parsed.
                 None means the site answered that the RSS link has expired.
        """
        if not url:
            return []
        try:
            ret = RequestUtils(proxies=settings.PROXY if proxy else None, timeout=timeout).get_res(url)
        except Exception as err:
            logger.error(f"获取RSS失败:{str(err)} - {traceback.format_exc()}")
            return []
        if not ret:
            return []

        ret_array: List[dict] = []
        ret_xml = ""
        try:
            ret_xml = RssHelper._decode_response(ret)
            # NOTE(review): xml.dom.minidom is not hardened against maliciously
            # constructed XML; the feed content comes from a remote site.
            dom_tree = xml.dom.minidom.parseString(ret_xml)
            items = dom_tree.documentElement.getElementsByTagName("item")
            for item in items:
                try:
                    entry = RssHelper._parse_item(item)
                except Exception as e1:
                    # One malformed item must not discard the rest of the feed
                    logger.debug(f"解析RSS失败:{str(e1)} - {traceback.format_exc()}")
                    continue
                if entry:
                    ret_array.append(entry)
        except Exception as e2:
            logger.error(f"解析RSS失败:{str(e2)} - {traceback.format_exc()}")
            # An expired link yields a plain-text notice that is not valid XML,
            # so the expiry check only makes sense once parsing has failed.
            if RssHelper._is_rss_expired(ret_xml):
                return None
        return ret_array

    def get_rss_link(self, url: str, cookie: str, ua: str, proxy: bool = False) -> Tuple[str, str]:
        """
        Obtain the RSS address of a site by requesting its RSS generation page.

        :param url: site address
        :param cookie: site cookie
        :param ua: site User-Agent
        :param proxy: whether to send the request through settings.PROXY
        :return: tuple (rss address, error message); exactly one of them is
                 non-empty. Exceptions are caught and reported in the message.
        """
        try:
            # Site domain, used to select the configuration entry
            domain = StringUtils.get_url_domain(url)
            site_conf = self.rss_link_conf.get(domain) or self.rss_link_conf.get("default")
            # Page that produces the RSS link, and the form data to send to it
            rss_url = urljoin(url, site_conf.get("url"))
            rss_params = site_conf.get("params")
            proxies = settings.PROXY if proxy else None
            if site_conf.get("render"):
                # Site requires a rendered page: fetch it through the browser helper
                html_text = PlaywrightHelper().get_page_source(
                    url=rss_url,
                    cookies=cookie,
                    ua=ua,
                    proxies=proxies
                )
            else:
                res = RequestUtils(
                    cookies=cookie,
                    timeout=60,
                    ua=ua,
                    proxies=proxies
                ).post_res(url=rss_url, data=rss_params)
                if res:
                    html_text = res.text
                elif res is not None:
                    # A response was received but it is falsy (error status)
                    return "", f"获取 {url} RSS链接失败,错误码:{res.status_code},错误原因:{res.reason}"
                else:
                    return "", f"获取RSS链接失败:无法连接 {url} "
            # Nothing to parse (e.g. the rendered fetch produced no source)
            if not html_text:
                return "", f"获取RSS链接失败:{url}"
            html = etree.HTML(html_text)
            # Compare with None explicitly: the truth value of an lxml element
            # reflects whether it has children, not whether parsing succeeded.
            if html is not None:
                rss_link = html.xpath(site_conf.get("xpath"))
                if rss_link:
                    return str(rss_link[-1]), ""
            return "", f"获取RSS链接失败:{url}"
        except Exception as e:
            return "", f"获取 {url} RSS链接失败:{str(e)}"
|