diff --git a/app/helper/browser.py b/app/helper/browser.py index 2b0b474e..99381d5b 100644 --- a/app/helper/browser.py +++ b/app/helper/browser.py @@ -29,6 +29,7 @@ class PlaywrightHelper: if cookies: page.set_extra_http_headers({"cookie": cookies}) try: + page.goto(url) page.wait_for_load_state("networkidle", timeout=timeout * 1000) source = page.content() diff --git a/app/helper/cloudflare.py b/app/helper/cloudflare.py deleted file mode 100644 index cc0945b1..00000000 --- a/app/helper/cloudflare.py +++ /dev/null @@ -1,201 +0,0 @@ -import time -import os - -from func_timeout import func_timeout, FunctionTimedOut -from pyquery import PyQuery -from selenium.common import TimeoutException -from selenium.webdriver import ActionChains -from selenium.webdriver.common.by import By -from selenium.webdriver.remote.webdriver import WebDriver -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.support.wait import WebDriverWait - -from app.log import logger - -ACCESS_DENIED_TITLES = [ - # Cloudflare - 'Access denied', - # Cloudflare http://bitturk.net/ Firefox - 'Attention Required! | Cloudflare' -] -ACCESS_DENIED_SELECTORS = [ - # Cloudflare - 'div.cf-error-title span.cf-code-label span', - # Cloudflare http://bitturk.net/ Firefox - '#cf-error-details div.cf-error-overview h1' -] -CHALLENGE_TITLES = [ - # Cloudflare - 'Just a moment...', - '请稍候…', - # DDoS-GUARD - 'DDOS-GUARD', -] -CHALLENGE_SELECTORS = [ - # Cloudflare - '#cf-challenge-running', '.ray_id', '.attack-box', '#cf-please-wait', '#challenge-spinner', '#trk_jschal_js', - # Custom CloudFlare for EbookParadijs, Film-Paleis, MuziekFabriek and Puur-Hollands - 'td.info #js_info', - # Fairlane / pararius.com - 'div.vc div.text-box h2' -] -SHORT_TIMEOUT = 6 -CF_TIMEOUT = int(os.getenv("NASTOOL_CF_TIMEOUT", "60")) - - -def resolve_challenge(driver: WebDriver, timeout=CF_TIMEOUT): - start_ts = time.time() - try: - func_timeout(timeout, _evil_logic, args=(driver,)) - return True - except FunctionTimedOut: - logger.error(f'Error solving the challenge. Timeout {timeout} after {round(time.time() - start_ts, 1)} seconds.') - return False - except Exception as e: - logger.error('Error solving the challenge. ' + str(e)) - return False - - -def under_challenge(html_text: str): - """ - Check if the page is under challenge - :param html_text: - :return: - """ - # get the page title - if not html_text: - return False - page_title = PyQuery(html_text)('title').text() - logger.debug("under_challenge page_title=" + page_title) - for title in CHALLENGE_TITLES: - if page_title.lower() == title.lower(): - return True - for selector in CHALLENGE_SELECTORS: - html_doc = PyQuery(html_text) - if html_doc(selector): - return True - return False - - -def _until_title_changes(driver: WebDriver, titles): - WebDriverWait(driver, SHORT_TIMEOUT).until_not(lambda x: _any_match_titles(x, titles)) - - -def _any_match_titles(driver: WebDriver, titles): - page_title = driver.title - for title in titles: - if page_title.lower() == title.lower(): - return True - return False - - -def _until_selectors_disappear(driver: WebDriver, selectors): - WebDriverWait(driver, SHORT_TIMEOUT).until_not(lambda x: _any_match_selectors(x, selectors)) - - -def _any_match_selectors(driver: WebDriver, selectors): - for selector in selectors: - html_doc = PyQuery(driver.page_source) - if html_doc(selector): - return True - return False - - -def _evil_logic(driver: WebDriver): - driver.implicitly_wait(SHORT_TIMEOUT) - # wait for the page - html_element = driver.find_element(By.TAG_NAME, "html") - - # find access denied titles - if _any_match_titles(driver, ACCESS_DENIED_TITLES): - raise Exception('Cloudflare has blocked this request. ' - 'Probably your IP is banned for this site, check in your web browser.') - # find access denied selectors - if _any_match_selectors(driver, ACCESS_DENIED_SELECTORS): - raise Exception('Cloudflare has blocked this request. ' - 'Probably your IP is banned for this site, check in your web browser.') - - # find challenge by title - challenge_found = False - if _any_match_titles(driver, CHALLENGE_TITLES): - challenge_found = True - logger.info("Challenge detected. Title found: " + driver.title) - if not challenge_found: - # find challenge by selectors - if _any_match_selectors(driver, CHALLENGE_SELECTORS): - challenge_found = True - logger.info("Challenge detected. Selector found") - - attempt = 0 - if challenge_found: - while True: - try: - attempt = attempt + 1 - # wait until the title changes - _until_title_changes(driver, CHALLENGE_TITLES) - - # then wait until all the selectors disappear - _until_selectors_disappear(driver, CHALLENGE_SELECTORS) - - # all elements not found - break - - except TimeoutException: - logger.debug("Timeout waiting for selector") - - click_verify(driver) - - # update the html (cloudflare reloads the page every 5 s) - html_element = driver.find_element(By.TAG_NAME, "html") - - # waits until cloudflare redirection ends - logger.debug("Waiting for redirect") - # noinspection PyBroadException - try: - WebDriverWait(driver, SHORT_TIMEOUT).until(EC.staleness_of(html_element)) - except Exception: - logger.debug("Timeout waiting for redirect") - - logger.info("Challenge solved!") - else: - logger.info("Challenge not detected!") - - -def click_verify(driver: WebDriver): - try: - logger.debug("Try to find the Cloudflare verify checkbox") - iframe = driver.find_element(By.XPATH, "//iframe[@title='Widget containing a Cloudflare security challenge']") - driver.switch_to.frame(iframe) - checkbox = driver.find_element( - by=By.XPATH, - value='//*[@id="cf-stage"]//label[@class="ctp-checkbox-label"]/input', - ) - if checkbox: - actions = ActionChains(driver) - actions.move_to_element_with_offset(checkbox, 5, 7) - actions.click(checkbox) - actions.perform() - logger.debug("Cloudflare verify checkbox found and clicked") - except Exception as e: - logger.debug(f"Cloudflare verify checkbox not found on the page: {str(e)}") - # print(e) - finally: - driver.switch_to.default_content() - - try: - logger.debug("Try to find the Cloudflare 'Verify you are human' button") - button = driver.find_element( - by=By.XPATH, - value="//input[@type='button' and @value='Verify you are human']", - ) - if button: - actions = ActionChains(driver) - actions.move_to_element_with_offset(button, 5, 7) - actions.click(button) - actions.perform() - logger.debug("The Cloudflare 'Verify you are human' button found and clicked") - except Exception as e: - logger.debug(f"The Cloudflare 'Verify you are human' button not found on the page:{str(e)}") - # print(e) - - time.sleep(2) diff --git a/app/helper/sites.cp310-win_amd64.pyd b/app/helper/sites.cp310-win_amd64.pyd index 97a90112..e3c9bcde 100644 Binary files a/app/helper/sites.cp310-win_amd64.pyd and b/app/helper/sites.cp310-win_amd64.pyd differ diff --git a/app/helper/sites.cpython-310-x86_64-linux-gnu.so b/app/helper/sites.cpython-310-x86_64-linux-gnu.so index 2a591d34..e5473e9e 100644 Binary files a/app/helper/sites.cpython-310-x86_64-linux-gnu.so and b/app/helper/sites.cpython-310-x86_64-linux-gnu.so differ