From 80c2d13aede07a086897c73eb132adcbbb63fce8 Mon Sep 17 00:00:00 2001
From: jmmzon
Date: Wed, 20 Sep 2023 13:04:29 +0200
Subject: [PATCH] Added --user-scripts-timeout parameter (#3)

Added the --user-scripts-timeout parameter, which allows long-running
async tasks to succeed.
---
 README.md                 | 1 +
 scrapper/core/__init__.py | 3 +++
 scrapper/util/argutil.py  | 5 +++++
 3 files changed, 9 insertions(+)

diff --git a/README.md b/README.md
index d5f70e1..2cf0195 100644
--- a/README.md
+++ b/README.md
@@ -103,6 +103,7 @@ All other request parameters are optional and have default values. However, you
 | `stealth` | Stealth mode allows you to bypass anti-scraping techniques. It is disabled by default. | `false` |
 | `screenshot` | If this option is set to true, the result will have the link to the screenshot of the page (`screenshot` field in the response). Important implementation details: Initially, Scrapper attempts to take a screenshot of the entire scrollable page. If it fails because the image is too large, it will only capture the currently visible viewport. | `false` |
 | `user-scripts` | To use your JavaScript scripts on the page, add script files to the `user_scripts` directory, and list the required ones (separated by commas) in the `user-scripts` parameter. These scripts will execute after the page loads but before the article parser runs. This allows you to help parse the article in a variety of ways, such as removing markup, ad blocks, or anything else. For example: `user-scripts=remove_ads.js, click_cookie_accept_button.js` | |
+| `user-scripts-timeout` | Waits for the given timeout in milliseconds after the user scripts have been injected. For example, if a user script needs to navigate through the page to specific content, set a higher value to give it enough time to finish. The default value is 0, which means no waiting. | `0` |
 
 #### Playwright settings
 | Parameter | Description | Default |
diff --git a/scrapper/core/__init__.py b/scrapper/core/__init__.py
index cfe6e98..4c86aa2 100644
--- a/scrapper/core/__init__.py
+++ b/scrapper/core/__init__.py
@@ -74,6 +74,9 @@ def page_processing(page, args, init_scripts=None):
         for script_name in args.user_scripts:
             page.add_script_tag(path=USER_SCRIPTS / script_name)
 
+    # wait for the given timeout in milliseconds after the user scripts have been injected
+    if args.user_scripts_timeout:
+        page.wait_for_timeout(args.user_scripts_timeout)
 
 def resource_blocker(whitelist):  # list of resource types to allow
     def block(route):
diff --git a/scrapper/util/argutil.py b/scrapper/util/argutil.py
index 8557284..cbbb9b6 100644
--- a/scrapper/util/argutil.py
+++ b/scrapper/util/argutil.py
@@ -129,7 +129,12 @@ def f(name, val):
     # and list the required ones (separated by commas) in the `user-scripts` parameter. These scripts will execute after the page loads
     # but before the article parser runs. This allows you to help parse the article in a variety of ways,
     # such as removing markup, ad blocks, or anything else. For example: user-scripts=remove_ads.js, click_cookie_accept_button.js
+    # If you plan to run long-running asynchronous scripts, see the `user-scripts-timeout` parameter.
    ('user-scripts', (is_list,), None),
+    # Waits for the given timeout in milliseconds after the user scripts have been injected.
+    # For example, if a user script needs to navigate through the page to specific content, set a higher value.
+    # The default value is 0, which means no waiting.
+    ('user-scripts-timeout', (is_number, gte(0)), 0),
     #
     #
     # Playwright settings:
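
For context, here is a minimal sketch (not part of the patch) of what the new setting does at the Playwright level, assuming an existing `user_scripts/click_cookie_accept_button.js` file and an arbitrary 5000 ms value:

```python
# Illustrative only: the script path and the 5000 ms value are assumptions.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    page.goto("https://example.com")

    # inject a user script, as Scrapper does for each entry in `user-scripts`
    page.add_script_tag(path="user_scripts/click_cookie_accept_button.js")

    # what user-scripts-timeout=5000 requests: give async scripts 5 s to finish
    page.wait_for_timeout(5000)

    content = page.content()  # by now the script's DOM changes are in place
    browser.close()
```

In Scrapper itself the same wait is driven by `args.user_scripts_timeout`, so a request with `user-scripts-timeout=5000` pauses for five seconds after the scripts are injected, before the article parser runs.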