From 54e153d74860c271eb31142644ad088a0b4f6569 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 10:40:25 +0200 Subject: [PATCH 1/8] docs: Add Scrapling guide --- docs/01_introduction/quick-start.mdx | 1 + docs/03_guides/09_scrapling.mdx | 123 +++++++++++++++++++++++++++ docs/03_guides/code/09_scrapling.py | 95 +++++++++++++++++++++ 3 files changed, 219 insertions(+) create mode 100644 docs/03_guides/09_scrapling.mdx create mode 100644 docs/03_guides/code/09_scrapling.py diff --git a/docs/01_introduction/quick-start.mdx b/docs/01_introduction/quick-start.mdx index da166da96..c0f8bec37 100644 --- a/docs/01_introduction/quick-start.mdx +++ b/docs/01_introduction/quick-start.mdx @@ -105,4 +105,5 @@ To see how you can integrate the Apify SDK with popular web scraping libraries, - [Selenium](../guides/selenium) - [Crawlee](../guides/crawlee) - [Scrapy](../guides/scrapy) +- [Scrapling](../guides/scrapling) - [Running webserver](../guides/running-webserver) diff --git a/docs/03_guides/09_scrapling.mdx b/docs/03_guides/09_scrapling.mdx new file mode 100644 index 000000000..459e5a255 --- /dev/null +++ b/docs/03_guides/09_scrapling.mdx @@ -0,0 +1,123 @@ +--- +id: scrapling +title: Use Scrapling +description: Build an Apify Actor that scrapes web pages using the Scrapling adaptive web scraping library. +--- + +import CodeBlock from '@theme/CodeBlock'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; + +import ScraplingExample from '!!raw-loader!roa-loader!./code/09_scrapling.py'; + +In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library in your Apify Actors. + +## Introduction + +[Scrapling](https://scrapling.readthedocs.io/) is an adaptive web scraping library for Python that combines fetching and parsing behind a single, high-level API. 
It can fetch a page with fast HTTP requests or with a real browser, parse the result with familiar CSS selectors and XPath, and even relocate your selectors automatically when a website's structure changes. + +Some of the features that make Scrapling a good fit for Apify Actors: + +- **Multiple fetchers** - A single API exposes a fast HTTP client with browser TLS-fingerprint impersonation, as well as full browser automation for JavaScript-heavy or protected pages. +- **Adaptive selectors** - Scrapling can remember the elements you scraped and find them again after a website redesign, so your scrapers keep working with fewer manual fixes. +- **Anti-bot evasion** - Built-in stealth features (browser impersonation, realistic headers, and automatic Cloudflare Turnstile solving with the browser fetchers) help you avoid being blocked. +- **Familiar parsing API** - Elements are selected with CSS selectors (including the `::text` and `::attr()` pseudo-elements) or XPath, with a Scrapy/Parsel-like `.get()` and `.getall()` interface. +- **First-class async support** - Every fetcher has an asynchronous variant, which integrates naturally with the asyncio-based Apify SDK. + +Scrapling's parser works on its own, while the fetchers are an optional extra. Install Scrapling with the `fetchers` extra to get the HTTP and browser fetchers: + +```bash +pip install "scrapling[fetchers]" +``` + +## Choosing a fetcher + +All of Scrapling's fetchers are importable from `scrapling.fetchers`. Pick the one that matches the website you're scraping: + +- **`Fetcher` / `AsyncFetcher`** - Plain HTTP requests via `.get()`, `.post()`, `.put()`, and `.delete()`. Fast and lightweight, with optional browser TLS-fingerprint impersonation (`impersonate`) and realistic headers (`stealthy_headers`). This is the best choice for static pages and APIs, and it needs no browser binaries. 
+- **`DynamicFetcher` / `DynamicSession`** - Full browser automation based on [Playwright](https://playwright.dev/), for pages that require JavaScript rendering or interaction. Fetch a page with `.fetch()` or its async variant `.async_fetch()`. +- **`StealthyFetcher` / `StealthySession`** - A stealth-hardened browser fetcher that can automatically solve Cloudflare Turnstile challenges (`solve_cloudflare=True`). Use it for the most heavily protected websites. + +The returned `Response` object is also a Scrapling selector, so you can call `.css()`, `.xpath()`, `.find_all()`, and the other parsing methods on it directly. + +The HTTP fetchers work with just the `scrapling[fetchers]` extra. The browser-based fetchers (`DynamicFetcher` and `StealthyFetcher`) additionally need browser binaries, which you download with the `scrapling install` command - see [Running browser-based fetchers](#running-browser-based-fetchers) below. + +The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simplest to deploy and pairs well with Apify Proxy. + +## Example Actor + +The following Actor recursively scrapes titles from all linked pages, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. + +<RunnableCodeBlock className="language-python" language="python"> + {ScraplingExample} +</RunnableCodeBlock> + +A few things worth pointing out: + +- The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. +- `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. 
+- The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which - combined with Apify Proxy - reduces the chance of being blocked. + +## Using Apify Proxy + +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example above creates a proxy configuration and passes a fresh proxy URL to every request: + +```python +proxy_configuration = await Actor.create_proxy_configuration() +... +proxy_url = None +if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + +response = await AsyncFetcher.get(url, proxy=proxy_url) +``` + +Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. The browser-based fetchers accept the same `proxy` argument. + +## Running browser-based fetchers + +`DynamicFetcher` and `StealthyFetcher` drive a real browser, so they need the browser binaries installed with the `scrapling install` command. Locally, run it once after installing the `scrapling[fetchers]` extra: + +```bash +scrapling install +``` + +On the Apify platform, the Actor runs in a Docker container, so the browsers have to be installed during the image build. 
Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and then download the browser binaries that Scrapling expects: + + +{`FROM apify/actor-python-playwright:3.14-1.60.0 + +COPY --chown=myuser:myuser requirements.txt ./ +RUN pip install -r requirements.txt + +# Download the browser binaries Scrapling needs. The base image already provides +# their system-level dependencies, so run this step as root. +USER root +RUN scrapling install +USER myuser + +COPY --chown=myuser:myuser . ./ +RUN python -m compileall -q my_actor/ + +CMD ["python", "-m", "my_actor"]`} + + +Fetching a page then only differs in which fetcher you call - the parsing API is identical: + +```python +from scrapling.fetchers import DynamicFetcher + +response = await DynamicFetcher.async_fetch(url, headless=True, network_idle=True) +quotes = response.css('.quote .text::text').getall() +``` + +## Conclusion + +In this guide, you learned how to use Scrapling in your Apify Actors. You can now fetch pages with Scrapling's HTTP or browser-based fetchers, extract data with its CSS and XPath selectors, route requests through Apify Proxy, and run the whole thing on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
+ +## Additional resources + +- [Scrapling: Official documentation](https://scrapling.readthedocs.io/) +- [Scrapling: Fetchers](https://scrapling.readthedocs.io/en/latest/fetching/choosing/) +- [Scrapling: Parsing and selecting elements](https://scrapling.readthedocs.io/en/latest/parsing/selection/) +- [Scrapling: GitHub repository](https://github.com/D4Vinci/Scrapling) +- [Apify: Proxy management](https://docs.apify.com/platform/proxy) diff --git a/docs/03_guides/code/09_scrapling.py b/docs/03_guides/code/09_scrapling.py new file mode 100644 index 000000000..fed1b5ae8 --- /dev/null +++ b/docs/03_guides/code/09_scrapling.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +from scrapling.fetchers import AsyncFetcher + +from apify import Actor, Request + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a proxy configuration that routes requests through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + new_request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(new_request) + + # Process the URLs from the request queue. 
+ while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an unexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Get a fresh proxy URL for each request (None if no proxy is set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + # Fetch the page with Scrapling's asynchronous HTTP fetcher. The + # `impersonate` and `stealthy_headers` options make the request look + # like it comes from a real Chrome browser, reducing the chance of + # being blocked. The returned response is also a Scrapling selector. + response = await AsyncFetcher.get( + url, + proxy=proxy_url, + impersonate='chrome', + stealthy_headers=True, + timeout=60, + ) + + # If the current depth is less than max_depth, find nested links + # and enqueue them. The `::attr(href)` pseudo-selector reads the + # attribute, and `response.urljoin` resolves it against the page URL. + if depth < max_depth: + for link_href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(link_href) + + if link_url.startswith(('http://', 'https://')): + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + # Extract the desired data using Scrapling's CSS selectors. The + # `::text` pseudo-element returns the text content of the elements. + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Store the extracted data to the default dataset. 
+ await Actor.push_data(data) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled to ensure it is not processed again. + await request_queue.mark_request_as_handled(request) From 29c4c8a8a35a83410f998ad8f6d6efea0f9decbf Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 11:24:10 +0200 Subject: [PATCH 2/8] docs: Split Scrapling guide example into modules and use code tabs --- docs/03_guides/09_scrapling.mdx | 93 +++++++++--------- docs/03_guides/code/09_scrapling.py | 95 ------------------- .../code/scrapling_browser_project/Dockerfile | 21 ++++ .../my_actor/scraper.py | 45 +++++++++ .../scrapling_project/my_actor/__init__.py | 0 .../scrapling_project/my_actor/__main__.py | 8 ++ .../code/scrapling_project/my_actor/main.py | 71 ++++++++++++++ .../scrapling_project/my_actor/scraper.py | 47 +++++++++ pyproject.toml | 4 + 9 files changed, 245 insertions(+), 139 deletions(-) delete mode 100644 docs/03_guides/code/09_scrapling.py create mode 100644 docs/03_guides/code/scrapling_browser_project/Dockerfile create mode 100644 docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/__init__.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/__main__.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/main.py create mode 100644 docs/03_guides/code/scrapling_project/my_actor/scraper.py diff --git a/docs/03_guides/09_scrapling.mdx b/docs/03_guides/09_scrapling.mdx index 459e5a255..3e76ebcac 100644 --- a/docs/03_guides/09_scrapling.mdx +++ b/docs/03_guides/09_scrapling.mdx @@ -5,9 +5,14 @@ description: Build an Apify Actor that scrapes web pages using the Scrapling ada --- import CodeBlock from '@theme/CodeBlock'; -import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; -import ScraplingExample from 
'!!raw-loader!roa-loader!./code/09_scrapling.py'; +import ScraplingMain from '!!raw-loader!./code/scrapling_project/my_actor/main.py'; +import ScraplingScraper from '!!raw-loader!./code/scrapling_project/my_actor/scraper.py'; +import ScraplingEntrypoint from '!!raw-loader!./code/scrapling_project/my_actor/__main__.py'; +import ScraplingBrowserScraper from '!!raw-loader!./code/scrapling_browser_project/my_actor/scraper.py'; +import ScraplingBrowserDockerfile from '!!raw-loader!./code/scrapling_browser_project/Dockerfile'; In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library in your Apify Actors. @@ -47,29 +52,40 @@ The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simpl The following Actor recursively scrapes titles from all linked pages, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. -<RunnableCodeBlock className="language-python" language="python"> - {ScraplingExample} -</RunnableCodeBlock> +The code is split into three small modules, following the structure of the Apify Python Actor templates: + +- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), and drives the crawl. +- `my_actor/scraper.py` - The Scrapling-specific logic. A single `scrape_page` function fetches a page and returns the extracted data together with the links found on it. +- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. 
+ + + + + {ScraplingMain} + + + + + {ScraplingScraper} + + + + + {ScraplingEntrypoint} + + + A few things worth pointing out: +- Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `my_actor/main.py` decides what to store and what to enqueue. - The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. - `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. - The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which - combined with Apify Proxy - reduces the chance of being blocked. ## Using Apify Proxy -Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. The example above creates a proxy configuration and passes a fresh proxy URL to every request: - -```python -proxy_configuration = await Actor.create_proxy_configuration() -... -proxy_url = None -if proxy_configuration: - proxy_url = await proxy_configuration.new_url() - -response = await AsyncFetcher.get(url, proxy=proxy_url) -``` +Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. 
Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. The browser-based fetchers accept the same `proxy` argument. @@ -81,34 +97,23 @@ Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy scrapling install ``` -On the Apify platform, the Actor runs in a Docker container, so the browsers have to be installed during the image build. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and then download the browser binaries that Scrapling expects: - - -{`FROM apify/actor-python-playwright:3.14-1.60.0 - -COPY --chown=myuser:myuser requirements.txt ./ -RUN pip install -r requirements.txt - -# Download the browser binaries Scrapling needs. The base image already provides -# their system-level dependencies, so run this step as root. -USER root -RUN scrapling install -USER myuser - -COPY --chown=myuser:myuser . ./ -RUN python -m compileall -q my_actor/ - -CMD ["python", "-m", "my_actor"]`} - - -Fetching a page then only differs in which fetcher you call - the parsing API is identical: - -```python -from scrapling.fetchers import DynamicFetcher - -response = await DynamicFetcher.async_fetch(url, headless=True, network_idle=True) -quotes = response.css('.quote .text::text').getall() -``` +Switching the example Actor from HTTP to a real browser only takes two changes - the rest of the project, including `my_actor/main.py`, stays exactly the same: + +1. Swap the fetcher call in `my_actor/scraper.py` for `DynamicFetcher.async_fetch`. The parsing API is identical, so the data extraction is unchanged. +2. 
Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the build to download the browser binaries that Scrapling expects. + + + + + {ScraplingBrowserScraper} + + + + + {ScraplingBrowserDockerfile} + + + ## Conclusion diff --git a/docs/03_guides/code/09_scrapling.py b/docs/03_guides/code/09_scrapling.py deleted file mode 100644 index fed1b5ae8..000000000 --- a/docs/03_guides/code/09_scrapling.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import annotations - -from scrapling.fetchers import AsyncFetcher - -from apify import Actor, Request - - -async def main() -> None: - # Enter the context of the Actor. - async with Actor: - # Retrieve the Actor input, and use default values if not provided. - actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) - max_depth = actor_input.get('max_depth', 1) - - # Exit if no start URLs are provided. - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Create a proxy configuration that routes requests through Apify Proxy. - proxy_configuration = await Actor.create_proxy_configuration() - - # Open the default request queue for handling URLs to be processed. - request_queue = await Actor.open_request_queue() - - # Enqueue the start URLs with an initial crawl depth of 0. - for start_url in start_urls: - url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') - new_request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(new_request) - - # Process the URLs from the request queue. 
- while request := await request_queue.fetch_next_request(): - url = request.url - - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) - Actor.log.info(f'Scraping {url} (depth={depth}) ...') - - try: - # Get a fresh proxy URL for each request (None if no proxy is set up). - proxy_url = None - if proxy_configuration: - proxy_url = await proxy_configuration.new_url() - - # Fetch the page with Scrapling's asynchronous HTTP fetcher. The - # `impersonate` and `stealthy_headers` options make the request look - # like it comes from a real Chrome browser, reducing the chance of - # being blocked. The returned response is also a Scrapling selector. - response = await AsyncFetcher.get( - url, - proxy=proxy_url, - impersonate='chrome', - stealthy_headers=True, - timeout=60, - ) - - # If the current depth is less than max_depth, find nested links - # and enqueue them. The `::attr(href)` pseudo-selector reads the - # attribute, and `response.urljoin` resolves it against the page URL. - if depth < max_depth: - for link_href in response.css('a::attr(href)').getall(): - link_url = response.urljoin(link_href) - - if link_url.startswith(('http://', 'https://')): - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) - await request_queue.add_request(new_request) - - # Extract the desired data using Scrapling's CSS selectors. The - # `::text` pseudo-element returns the text content of the elements. - data = { - 'url': url, - 'title': response.css('title::text').get(), - 'h1s': response.css('h1::text').getall(), - 'h2s': response.css('h2::text').getall(), - 'h3s': response.css('h3::text').getall(), - } - - # Store the extracted data to the default dataset. 
- await Actor.push_data(data) - - except Exception: - Actor.log.exception(f'Cannot extract data from {url}.') - - finally: - # Mark the request as handled to ensure it is not processed again. - await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/scrapling_browser_project/Dockerfile b/docs/03_guides/code/scrapling_browser_project/Dockerfile new file mode 100644 index 000000000..38b30c605 --- /dev/null +++ b/docs/03_guides/code/scrapling_browser_project/Dockerfile @@ -0,0 +1,21 @@ +# Use the Apify Playwright base image, which already ships a browser together +# with all of its system-level dependencies. +FROM apify/actor-python-playwright:3.14-1.60.0 + +# Copy just requirements.txt first to leverage the Docker build cache. +COPY --chown=myuser:myuser requirements.txt ./ +RUN pip install -r requirements.txt + +# Download the browser binaries that Scrapling expects. The base image already +# provides their system-level dependencies, so run this step as root and then +# switch back to the unprivileged user. +USER root +RUN scrapling install +USER myuser + +# Copy the rest of the source code and verify that it compiles. +COPY --chown=myuser:myuser . ./ +RUN python -m compileall -q my_actor/ + +# Specify how to launch the Actor. +CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py b/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py new file mode 100644 index 000000000..fb7d4579d --- /dev/null +++ b/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from typing import Any + +from scrapling.fetchers import DynamicFetcher + + +async def scrape_page( + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page in a real browser and extract its data and links. 
+ + `DynamicFetcher` drives a real browser via Playwright, so it can render + JavaScript-heavy pages. `network_idle` waits until the page stops making + network requests before the HTML is captured. Apart from the fetcher call, + everything else - including the parsing - is identical to the HTTP version. + """ + response = await DynamicFetcher.async_fetch( + url, + proxy=proxy_url, + headless=True, + network_idle=True, + ) + + # Extract the desired data using CSS selectors. The `::text` pseudo-element + # returns the text content of the matched elements. + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Collect absolute links from the page. The `::attr(href)` pseudo-selector + # reads the attribute and `response.urljoin` resolves it against the page URL. + links: list[str] = [] + for href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links diff --git a/docs/03_guides/code/scrapling_project/my_actor/__init__.py b/docs/03_guides/code/scrapling_project/my_actor/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/docs/03_guides/code/scrapling_project/my_actor/__main__.py b/docs/03_guides/code/scrapling_project/my_actor/__main__.py new file mode 100644 index 000000000..6aeaf3d5d --- /dev/null +++ b/docs/03_guides/code/scrapling_project/my_actor/__main__.py @@ -0,0 +1,8 @@ +from __future__ import annotations + +import asyncio + +from .main import main + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/scrapling_project/my_actor/main.py b/docs/03_guides/code/scrapling_project/my_actor/main.py new file mode 100644 index 000000000..d2cd36e75 --- /dev/null +++ b/docs/03_guides/code/scrapling_project/my_actor/main.py @@ -0,0 +1,71 @@ 
+from __future__ import annotations + +from apify import Actor, Request + +from .scraper import scrape_page + + +async def main() -> None: + # Enter the context of the Actor. + async with Actor: + # Retrieve the Actor input, and use default values if not provided. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('max_depth', 1) + + # Exit if no start URLs are provided. + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Create a proxy configuration that routes requests through Apify Proxy. + proxy_configuration = await Actor.create_proxy_configuration() + + # Open the default request queue for handling URLs to be processed. + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs with an initial crawl depth of 0. + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing {url} ...') + request = Request.from_url(url, user_data={'depth': 0}) + await request_queue.add_request(request) + + # Process the URLs from the request queue. + while request := await request_queue.fetch_next_request(): + url = request.url + + if not isinstance(request.user_data['depth'], (str, int)): + raise TypeError('Request.depth is an unexpected type.') + + depth = int(request.user_data['depth']) + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Get a fresh proxy URL for each request (None if no proxy set up). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + # Fetch the page and extract its data and nested links. + data, links = await scrape_page(url, proxy_url=proxy_url) + + # Store the extracted data to the default dataset. + await Actor.push_data(data) + + # If we are not too deep yet, enqueue the links we found. 
+ if depth < max_depth: + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + new_request = Request.from_url( + link_url, + user_data={'depth': depth + 1}, + ) + await request_queue.add_request(new_request) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + # Mark the request as handled so it is not processed again. + await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/scrapling_project/my_actor/scraper.py b/docs/03_guides/code/scrapling_project/my_actor/scraper.py new file mode 100644 index 000000000..b840db829 --- /dev/null +++ b/docs/03_guides/code/scrapling_project/my_actor/scraper.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from typing import Any + +from scrapling.fetchers import AsyncFetcher + + +async def scrape_page( + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Fetch a single page with Scrapling and extract its data and links. + + The page is fetched with Scrapling's asynchronous HTTP fetcher. The + `impersonate` and `stealthy_headers` options make the request look like it + comes from a real Chrome browser, which reduces the chance of being blocked. + The returned response is also a Scrapling selector, so it can be queried with + CSS selectors directly. + """ + response = await AsyncFetcher.get( + url, + proxy=proxy_url, + impersonate='chrome', + stealthy_headers=True, + timeout=60, + ) + + # Extract the desired data using CSS selectors. The `::text` pseudo-element + # returns the text content of the matched elements. + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Collect absolute links from the page. The `::attr(href)` pseudo-selector + # reads the attribute and `response.urljoin` resolves it against the page URL. 
+ links: list[str] = [] + for href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(href) + if link_url.startswith(('http://', 'https://')): + links.append(link_url) + + return data, links diff --git a/pyproject.toml b/pyproject.toml index d17bdc013..d8697219f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,6 +181,10 @@ indent-style = "space" # Local imports in Scrapy project. "TID252", # Prefer absolute imports over relative imports from parent modules ] +"**/docs/**/scrapling_project/**" = [ + # Local imports are mixed up with the Apify SDK. + "I001", # Import block is un-sorted or un-formatted +] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" From 2a41a3f3e19b1e664adcbe35a39bfdacc58e816d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 12:00:53 +0200 Subject: [PATCH 3/8] docs: use Request.crawl_depth for depth tracking in Scrapling example --- .../code/scrapling_project/my_actor/main.py | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/docs/03_guides/code/scrapling_project/my_actor/main.py b/docs/03_guides/code/scrapling_project/my_actor/main.py index d2cd36e75..52e9ef4cb 100644 --- a/docs/03_guides/code/scrapling_project/my_actor/main.py +++ b/docs/03_guides/code/scrapling_project/my_actor/main.py @@ -24,21 +24,18 @@ async def main() -> None: # Open the default request queue for handling URLs to be processed. request_queue = await Actor.open_request_queue() - # Enqueue the start URLs with an initial crawl depth of 0. + # Enqueue the start URLs. Their crawl depth defaults to 0. for start_url in start_urls: url = start_url.get('url') Actor.log.info(f'Enqueuing {url} ...') - request = Request.from_url(url, user_data={'depth': 0}) - await request_queue.add_request(request) + await request_queue.add_request(Request.from_url(url)) # Process the URLs from the request queue. 
while request := await request_queue.fetch_next_request(): url = request.url - if not isinstance(request.user_data['depth'], (str, int)): - raise TypeError('Request.depth is an unexpected type.') - - depth = int(request.user_data['depth']) + # Read the crawl depth tracked by the request itself. + depth = request.crawl_depth Actor.log.info(f'Scraping {url} (depth={depth}) ...') try: @@ -53,14 +50,13 @@ async def main() -> None: # Store the extracted data to the default dataset. await Actor.push_data(data) - # If we are not too deep yet, enqueue the links we found. + # If we are not too deep yet, enqueue the links we found one + # level deeper than the current page. if depth < max_depth: for link_url in links: Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url( - link_url, - user_data={'depth': depth + 1}, - ) + new_request = Request.from_url(link_url) + new_request.crawl_depth = depth + 1 await request_queue.add_request(new_request) except Exception: From 910df14999f02c3f22e9fac77322148a4f0630e2 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 20:45:03 +0200 Subject: [PATCH 4/8] docs: renumber Scrapling guide to 07 and switch to a single-file example --- .../{09_scrapling.mdx => 07_scrapling.mdx} | 74 +++-------- docs/03_guides/code/07_scrapling.py | 122 ++++++++++++++++++ .../scraper.py => 07_scrapling_browser.py} | 16 +-- .../code/scrapling_browser_project/Dockerfile | 21 --- .../scrapling_project/my_actor/__init__.py | 0 .../scrapling_project/my_actor/__main__.py | 8 -- .../code/scrapling_project/my_actor/main.py | 67 ---------- .../scrapling_project/my_actor/scraper.py | 47 ------- 8 files changed, 146 insertions(+), 209 deletions(-) rename docs/03_guides/{09_scrapling.mdx => 07_scrapling.mdx} (63%) create mode 100644 docs/03_guides/code/07_scrapling.py rename docs/03_guides/code/{scrapling_browser_project/my_actor/scraper.py => 07_scrapling_browser.py} (52%) delete mode 100644 
docs/03_guides/code/scrapling_browser_project/Dockerfile delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/__init__.py delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/__main__.py delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/main.py delete mode 100644 docs/03_guides/code/scrapling_project/my_actor/scraper.py diff --git a/docs/03_guides/09_scrapling.mdx b/docs/03_guides/07_scrapling.mdx similarity index 63% rename from docs/03_guides/09_scrapling.mdx rename to docs/03_guides/07_scrapling.mdx index 3e76ebcac..63e948e59 100644 --- a/docs/03_guides/09_scrapling.mdx +++ b/docs/03_guides/07_scrapling.mdx @@ -1,20 +1,16 @@ --- id: scrapling -title: Use Scrapling +title: Adaptive scraping with Scrapling description: Build an Apify Actor that scrapes web pages using the Scrapling adaptive web scraping library. --- import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; +import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; -import ScraplingMain from '!!raw-loader!./code/scrapling_project/my_actor/main.py'; -import ScraplingScraper from '!!raw-loader!./code/scrapling_project/my_actor/scraper.py'; -import ScraplingEntrypoint from '!!raw-loader!./code/scrapling_project/my_actor/__main__.py'; -import ScraplingBrowserScraper from '!!raw-loader!./code/scrapling_browser_project/my_actor/scraper.py'; -import ScraplingBrowserDockerfile from '!!raw-loader!./code/scrapling_browser_project/Dockerfile'; +import ScraplingExample from '!!raw-loader!roa-loader!./code/07_scrapling.py'; +import ScraplingBrowserScraper from '!!raw-loader!./code/07_scrapling_browser.py'; -In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library in your Apify Actors. +In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthedocs.io/) library for adaptive web scraping in your Apify Actors. 
## Introduction @@ -50,42 +46,24 @@ The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simpl ## Example Actor -The following Actor recursively scrapes titles from all linked pages, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. - -The code is split into three small modules, following the structure of the Apify Python Actor templates: - -- `my_actor/main.py` - The Actor's main coroutine. It handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), and drives the crawl. -- `my_actor/scraper.py` - The Scrapling-specific logic. A single `scrape_page` function fetches a page and returns the extracted data together with the links found on it. -- `my_actor/__main__.py` - The entry point that runs the `main` coroutine with `asyncio`. - - - - - {ScraplingMain} - - - - - {ScraplingScraper} - - - - - {ScraplingEntrypoint} - - - +The following Actor recursively scrapes data from linked pages on the same site, up to a user-defined maximum depth, starting from the URLs in the Actor input. It uses Scrapling's `AsyncFetcher` to fetch each page through [Apify Proxy](https://docs.apify.com/platform/proxy), and CSS selectors to extract the title, headings, and links. + +The whole Actor fits in a single file. 
A `scrape_page` helper holds the Scrapling-specific fetching and parsing, while the `main` coroutine handles the [Actor](https://docs.apify.com/platform/actors) lifecycle, reads the input, sets up [Apify Proxy](https://docs.apify.com/platform/proxy) and the [request queue](https://docs.apify.com/platform/storage/request-queue), and drives the crawl: + + + {ScraplingExample} + A few things worth pointing out: -- Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `my_actor/main.py` decides what to store and what to enqueue. +- Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `main` decides what to store and what to enqueue. - The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. - `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. - The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which - combined with Apify Proxy - reduces the chance of being blocked. ## Using Apify Proxy -Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `my_actor/main.py` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. 
+Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `main` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. The browser-based fetchers accept the same `proxy` argument. @@ -97,23 +75,13 @@ Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy scrapling install ``` -Switching the example Actor from HTTP to a real browser only takes two changes - the rest of the project, including `my_actor/main.py`, stays exactly the same: - -1. Swap the fetcher call in `my_actor/scraper.py` for `DynamicFetcher.async_fetch`. The parsing API is identical, so the data extraction is unchanged. -2. Build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the build to download the browser binaries that Scrapling expects. - - - - - {ScraplingBrowserScraper} - - - - - {ScraplingBrowserDockerfile} - - - +Switching the example Actor from HTTP to a real browser takes only one code change - swap the `AsyncFetcher.get` call in `scrape_page` for `DynamicFetcher.async_fetch`. 
The parsing API is identical, so the rest of the Actor stays exactly the same: + + + {ScraplingBrowserScraper} + + +To run this on the Apify platform, build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the Docker build to download the browser binaries that Scrapling expects. ## Conclusion diff --git a/docs/03_guides/code/07_scrapling.py b/docs/03_guides/code/07_scrapling.py new file mode 100644 index 000000000..49aab31be --- /dev/null +++ b/docs/03_guides/code/07_scrapling.py @@ -0,0 +1,122 @@ +import asyncio +from typing import Any +from urllib.parse import urlsplit + +from scrapling.fetchers import AsyncFetcher + +from apify import Actor, Request +from apify.storages import RequestQueue + + +async def scrape_page( + url: str, + *, + proxy_url: str | None = None, +) -> tuple[dict[str, Any], list[str]]: + """Fetch a page with Scrapling's HTTP fetcher and return data and links.""" + # `impersonate` and `stealthy_headers` make the request look like Chrome. + response = await AsyncFetcher.get( + url, + proxy=proxy_url, + impersonate='chrome', + stealthy_headers=True, + timeout=60, + ) + + data = { + 'url': url, + 'title': response.css('title::text').get(), + 'h1s': response.css('h1::text').getall(), + 'h2s': response.css('h2::text').getall(), + 'h3s': response.css('h3::text').getall(), + } + + # Keep only absolute links on the same host. 
+ links: list[str] = [] + host = urlsplit(url).netloc + for href in response.css('a::attr(href)').getall(): + link_url = response.urljoin(href) + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: + links.append(link_url) + + return data, links + + +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the links one level deeper, unless max_depth was reached.""" + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + +async def main() -> None: + async with Actor: + # Read the Actor input. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) + + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Set up Apify Proxy and the request queue. + proxy_configuration = await Actor.create_proxy_configuration() + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs (crawl depth defaults to 0). + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing start URL: {url}') + await request_queue.add_request(Request.from_url(url)) + + # Cap the crawl; raise or remove to follow more pages. + max_requests = 50 + handled_requests = 0 + + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 + url = request.url + depth = request.crawl_depth + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Fresh proxy URL per request (None if no proxy). 
+ proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + data, links = await scrape_page(url, proxy_url=proxy_url) + await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + await request_queue.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py b/docs/03_guides/code/07_scrapling_browser.py similarity index 52% rename from docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py rename to docs/03_guides/code/07_scrapling_browser.py index fb7d4579d..3eb50e244 100644 --- a/docs/03_guides/code/scrapling_browser_project/my_actor/scraper.py +++ b/docs/03_guides/code/07_scrapling_browser.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import Any from scrapling.fetchers import DynamicFetcher @@ -10,13 +8,8 @@ async def scrape_page( *, proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page in a real browser and extract its data and links. - - `DynamicFetcher` drives a real browser via Playwright, so it can render - JavaScript-heavy pages. `network_idle` waits until the page stops making - network requests before the HTML is captured. Apart from the fetcher call, - everything else - including the parsing - is identical to the HTTP version. - """ + """Fetch a page in a real browser with Scrapling and return data and links.""" + # `network_idle` waits until the page stops making network requests. response = await DynamicFetcher.async_fetch( url, proxy=proxy_url, @@ -24,8 +17,6 @@ async def scrape_page( network_idle=True, ) - # Extract the desired data using CSS selectors. 
The `::text` pseudo-element - # returns the text content of the matched elements. data = { 'url': url, 'title': response.css('title::text').get(), @@ -34,8 +25,7 @@ async def scrape_page( 'h3s': response.css('h3::text').getall(), } - # Collect absolute links from the page. The `::attr(href)` pseudo-selector - # reads the attribute and `response.urljoin` resolves it against the page URL. + # Collect absolute links from the page. links: list[str] = [] for href in response.css('a::attr(href)').getall(): link_url = response.urljoin(href) diff --git a/docs/03_guides/code/scrapling_browser_project/Dockerfile b/docs/03_guides/code/scrapling_browser_project/Dockerfile deleted file mode 100644 index 38b30c605..000000000 --- a/docs/03_guides/code/scrapling_browser_project/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -# Use the Apify Playwright base image, which already ships a browser together -# with all of its system-level dependencies. -FROM apify/actor-python-playwright:3.14-1.60.0 - -# Copy just requirements.txt first to leverage the Docker build cache. -COPY --chown=myuser:myuser requirements.txt ./ -RUN pip install -r requirements.txt - -# Download the browser binaries that Scrapling expects. The base image already -# provides their system-level dependencies, so run this step as root and then -# switch back to the unprivileged user. -USER root -RUN scrapling install -USER myuser - -# Copy the rest of the source code and verify that it compiles. -COPY --chown=myuser:myuser . ./ -RUN python -m compileall -q my_actor/ - -# Specify how to launch the Actor. 
-CMD ["python", "-m", "my_actor"] diff --git a/docs/03_guides/code/scrapling_project/my_actor/__init__.py b/docs/03_guides/code/scrapling_project/my_actor/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/03_guides/code/scrapling_project/my_actor/__main__.py b/docs/03_guides/code/scrapling_project/my_actor/__main__.py deleted file mode 100644 index 6aeaf3d5d..000000000 --- a/docs/03_guides/code/scrapling_project/my_actor/__main__.py +++ /dev/null @@ -1,8 +0,0 @@ -from __future__ import annotations - -import asyncio - -from .main import main - -if __name__ == '__main__': - asyncio.run(main()) diff --git a/docs/03_guides/code/scrapling_project/my_actor/main.py b/docs/03_guides/code/scrapling_project/my_actor/main.py deleted file mode 100644 index 52e9ef4cb..000000000 --- a/docs/03_guides/code/scrapling_project/my_actor/main.py +++ /dev/null @@ -1,67 +0,0 @@ -from __future__ import annotations - -from apify import Actor, Request - -from .scraper import scrape_page - - -async def main() -> None: - # Enter the context of the Actor. - async with Actor: - # Retrieve the Actor input, and use default values if not provided. - actor_input = await Actor.get_input() or {} - start_urls = actor_input.get('start_urls', [{'url': 'https://crawlee.dev'}]) - max_depth = actor_input.get('max_depth', 1) - - # Exit if no start URLs are provided. - if not start_urls: - Actor.log.info('No start URLs specified in Actor input, exiting...') - await Actor.exit() - - # Create a proxy configuration that routes requests through Apify Proxy. - proxy_configuration = await Actor.create_proxy_configuration() - - # Open the default request queue for handling URLs to be processed. - request_queue = await Actor.open_request_queue() - - # Enqueue the start URLs. Their crawl depth defaults to 0. 
- for start_url in start_urls: - url = start_url.get('url') - Actor.log.info(f'Enqueuing {url} ...') - await request_queue.add_request(Request.from_url(url)) - - # Process the URLs from the request queue. - while request := await request_queue.fetch_next_request(): - url = request.url - - # Read the crawl depth tracked by the request itself. - depth = request.crawl_depth - Actor.log.info(f'Scraping {url} (depth={depth}) ...') - - try: - # Get a fresh proxy URL for each request (None if no proxy set up). - proxy_url = None - if proxy_configuration: - proxy_url = await proxy_configuration.new_url() - - # Fetch the page and extract its data and nested links. - data, links = await scrape_page(url, proxy_url=proxy_url) - - # Store the extracted data to the default dataset. - await Actor.push_data(data) - - # If we are not too deep yet, enqueue the links we found one - # level deeper than the current page. - if depth < max_depth: - for link_url in links: - Actor.log.info(f'Enqueuing {link_url} ...') - new_request = Request.from_url(link_url) - new_request.crawl_depth = depth + 1 - await request_queue.add_request(new_request) - - except Exception: - Actor.log.exception(f'Cannot extract data from {url}.') - - finally: - # Mark the request as handled so it is not processed again. - await request_queue.mark_request_as_handled(request) diff --git a/docs/03_guides/code/scrapling_project/my_actor/scraper.py b/docs/03_guides/code/scrapling_project/my_actor/scraper.py deleted file mode 100644 index b840db829..000000000 --- a/docs/03_guides/code/scrapling_project/my_actor/scraper.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -from typing import Any - -from scrapling.fetchers import AsyncFetcher - - -async def scrape_page( - url: str, - *, - proxy_url: str | None = None, -) -> tuple[dict[str, Any], list[str]]: - """Fetch a single page with Scrapling and extract its data and links. - - The page is fetched with Scrapling's asynchronous HTTP fetcher. 
The - `impersonate` and `stealthy_headers` options make the request look like it - comes from a real Chrome browser, which reduces the chance of being blocked. - The returned response is also a Scrapling selector, so it can be queried with - CSS selectors directly. - """ - response = await AsyncFetcher.get( - url, - proxy=proxy_url, - impersonate='chrome', - stealthy_headers=True, - timeout=60, - ) - - # Extract the desired data using CSS selectors. The `::text` pseudo-element - # returns the text content of the matched elements. - data = { - 'url': url, - 'title': response.css('title::text').get(), - 'h1s': response.css('h1::text').getall(), - 'h2s': response.css('h2::text').getall(), - 'h3s': response.css('h3::text').getall(), - } - - # Collect absolute links from the page. The `::attr(href)` pseudo-selector - # reads the attribute and `response.urljoin` resolves it against the page URL. - links: list[str] = [] - for href in response.css('a::attr(href)').getall(): - link_url = response.urljoin(href) - if link_url.startswith(('http://', 'https://')): - links.append(link_url) - - return data, links From 404bdfb23d4e951b3d63f9660f8f6ef6e8107533 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 5 Jun 2026 21:02:44 +0200 Subject: [PATCH 5/8] chore: drop unused ruff ignore for the removed Scrapling project --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d8697219f..d17bdc013 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,10 +181,6 @@ indent-style = "space" # Local imports in Scrapy project. "TID252", # Prefer absolute imports over relative imports from parent modules ] -"**/docs/**/scrapling_project/**" = [ - # Local imports are mixed up with the Apify SDK. 
- "I001", # Import block is un-sorted or un-formatted -] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" From 55ad62ab8755e09d2ddbc38e8e57e5522400038a Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Mon, 8 Jun 2026 13:48:07 +0200 Subject: [PATCH 6/8] docs: reduce clause-gluing dashes in the Scrapling guide --- docs/03_guides/07_scrapling.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/03_guides/07_scrapling.mdx b/docs/03_guides/07_scrapling.mdx index 63e948e59..579f385a5 100644 --- a/docs/03_guides/07_scrapling.mdx +++ b/docs/03_guides/07_scrapling.mdx @@ -40,7 +40,7 @@ All of Scrapling's fetchers are importable from `scrapling.fetchers`. Pick the o The returned `Response` object is also a Scrapling selector, so you can call `.css()`, `.xpath()`, `.find_all()`, and the other parsing methods on it directly. -The HTTP fetchers work with just the `scrapling[fetchers]` extra. The browser-based fetchers (`DynamicFetcher` and `StealthyFetcher`) additionally need browser binaries, which you download with the `scrapling install` command - see [Running browser-based fetchers](#running-browser-based-fetchers) below. +The HTTP fetchers work with just the `scrapling[fetchers]` extra. The browser-based fetchers (`DynamicFetcher` and `StealthyFetcher`) additionally need browser binaries, which you download with the `scrapling install` command. See [Running browser-based fetchers](#running-browser-based-fetchers) below. The example Actor in this guide uses the HTTP `AsyncFetcher`, which is the simplest to deploy and pairs well with Apify Proxy. @@ -59,7 +59,7 @@ A few things worth pointing out: - Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `main` decides what to store and what to enqueue. 
- The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. - `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. -- The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which - combined with Apify Proxy - reduces the chance of being blocked. +- The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which, combined with Apify Proxy, reduces the chance of being blocked. ## Using Apify Proxy @@ -75,7 +75,7 @@ Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy scrapling install ``` -Switching the example Actor from HTTP to a real browser takes only one code change - swap the `AsyncFetcher.get` call in `scrape_page` for `DynamicFetcher.async_fetch`. The parsing API is identical, so the rest of the Actor stays exactly the same: +Switching the example Actor from HTTP to a real browser takes only one code change. Swap the `AsyncFetcher.get` call in `scrape_page` for `DynamicFetcher.async_fetch`. 
The parsing API is identical, so the rest of the Actor stays exactly the same: {ScraplingBrowserScraper} From 440a30ed6bf4582242914eb1d1bbc19b91b90049 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 9 Jun 2026 10:47:19 +0200 Subject: [PATCH 7/8] docs: adjust wording style --- docs/03_guides/07_scrapling.mdx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/03_guides/07_scrapling.mdx b/docs/03_guides/07_scrapling.mdx index 579f385a5..cfa075042 100644 --- a/docs/03_guides/07_scrapling.mdx +++ b/docs/03_guides/07_scrapling.mdx @@ -16,15 +16,15 @@ In this guide, you'll learn how to use the [Scrapling](https://scrapling.readthe [Scrapling](https://scrapling.readthedocs.io/) is an adaptive web scraping library for Python that combines fetching and parsing behind a single, high-level API. It can fetch a page with fast HTTP requests or with a real browser, parse the result with familiar CSS selectors and XPath, and even relocate your selectors automatically when a website's structure changes. -Some of the features that make Scrapling a good fit for Apify Actors: +Scrapling is a great fit for Apify Actors: -- **Multiple fetchers** - A single API exposes a fast HTTP client with browser TLS-fingerprint impersonation, as well as full browser automation for JavaScript-heavy or protected pages. -- **Adaptive selectors** - Scrapling can remember the elements you scraped and find them again after a website redesign, so your scrapers keep working with fewer manual fixes. -- **Anti-bot evasion** - Built-in stealth features (browser impersonation, realistic headers, and automatic Cloudflare Turnstile solving with the browser fetchers) help you avoid being blocked. -- **Familiar parsing API** - Elements are selected with CSS selectors (including the `::text` and `::attr()` pseudo-elements) or XPath, with a Scrapy/Parsel-like `.get()` and `.getall()` interface. 
-- **First-class async support** - Every fetcher has an asynchronous variant, which integrates naturally with the asyncio-based Apify SDK. +- A single API exposes a fast HTTP client with browser TLS-fingerprint impersonation, as well as full browser automation for JavaScript-heavy or protected pages. +- Scrapling can remember the elements you scraped and find them again after a website redesign. Your scrapers keep working with fewer manual fixes. +- Built-in stealth features (browser impersonation, realistic headers, and automatic Cloudflare Turnstile solving with the browser fetchers) help you avoid being blocked. +- Elements are selected with CSS selectors (including the `::text` and `::attr()` pseudo-elements) or XPath, with a Scrapy/Parsel-like `.get()` and `.getall()` interface. +- Every fetcher has an asynchronous variant, which integrates naturally with the asyncio-based Apify SDK. -Scrapling's parser works on its own, while the fetchers are an optional extra. Install Scrapling with the `fetchers` extra to get the HTTP and browser fetchers: +Scrapling's parser works on its own. The fetchers are an optional extra. Install Scrapling with the `fetchers` extra to get the HTTP and browser fetchers: ```bash pip install "scrapling[fetchers]" @@ -54,7 +54,7 @@ The whole Actor fits in a single file. A `scrape_page` helper holds the Scraplin {ScraplingExample} -A few things worth pointing out: +Note that: - Keeping the fetching and parsing in `scrape_page` separates the Scrapling-specific code from the Actor's orchestration logic. The function returns the extracted data together with the discovered links, so `main` decides what to store and what to enqueue. - The response of `AsyncFetcher.get` is a Scrapling selector, so `response.css('title::text').get()` reads the page title and `response.css('a::attr(href)').getall()` returns every link's `href` in one call. 
@@ -65,7 +65,7 @@ A few things worth pointing out: Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `main` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. -Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For more details, see the [Proxy management](../concepts/proxy-management) guide. The browser-based fetchers accept the same `proxy` argument. +Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy.apify.com:8000`), which is exactly what `ProxyConfiguration.new_url` returns. To select specific proxy groups or a country, pass the relevant arguments to `Actor.create_proxy_configuration`. For details, see [Proxy management](../concepts/proxy-management). The browser-based fetchers accept the same `proxy` argument. ## Running browser-based fetchers @@ -75,7 +75,7 @@ Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy scrapling install ``` -Switching the example Actor from HTTP to a real browser takes only one code change. Swap the `AsyncFetcher.get` call in `scrape_page` for `DynamicFetcher.async_fetch`. The parsing API is identical, so the rest of the Actor stays exactly the same: +Switching the example Actor from HTTP to a real browser takes only one code change. Swap the `AsyncFetcher.get` call in `scrape_page` for `DynamicFetcher.async_fetch`. 
The parsing API is identical, so the rest of the Actor stays the same: {ScraplingBrowserScraper} @@ -85,7 +85,7 @@ To run this on the Apify platform, build on top of the [Apify Playwright base im ## Conclusion -In this guide, you learned how to use Scrapling in your Apify Actors. You can now fetch pages with Scrapling's HTTP or browser-based fetchers, extract data with its CSS and XPath selectors, route requests through Apify Proxy, and run the whole thing on the Apify platform. See the [Actor templates](https://apify.com/templates/categories/python) to get started with your own scraping tasks. If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! +In this guide, you learned how to use Scrapling in your Apify Actors. You can now fetch pages with Scrapling's HTTP or browser-based fetchers, extract data with its CSS and XPath selectors, route requests through Apify Proxy, and run the whole thing on the Apify platform. To get started with your own scraping tasks, see the [Actor templates](https://apify.com/templates/categories/python). If you have questions or need assistance, feel free to reach out on our [GitHub](https://github.com/apify/apify-sdk-python) or join our [Discord community](https://discord.com/invite/jyEM2PRvMU). Happy scraping! 
## Additional resources From 1578540965a9e8e20fc82910f70e79a9f9884ccb Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 9 Jun 2026 11:30:28 +0200 Subject: [PATCH 8/8] docs: address review comments on Scrapling guide --- docs/03_guides/07_scrapling.mdx | 49 ++++++++- docs/03_guides/code/07_scrapling_browser.py | 104 ++++++++++++++++++-- 2 files changed, 141 insertions(+), 12 deletions(-) diff --git a/docs/03_guides/07_scrapling.mdx b/docs/03_guides/07_scrapling.mdx index cfa075042..cc1fae334 100644 --- a/docs/03_guides/07_scrapling.mdx +++ b/docs/03_guides/07_scrapling.mdx @@ -61,6 +61,30 @@ Note that: - `response.urljoin(link_href)` resolves relative links against the page URL, so you can enqueue them directly. - The `impersonate='chrome'` and `stealthy_headers=True` options make the request look like it comes from a real Chrome browser, which, combined with Apify Proxy, reduces the chance of being blocked. +## Adaptive selectors + +The example above uses plain CSS selectors. Scrapling can also track the elements you scrape and relocate them when a website changes its markup, so a redesign doesn't immediately break your scraper. This is most useful for scrapers that revisit the same pages over time, rather than one-off crawls. + +1. Enable adaptive matching once on the fetcher: + + ```python + AsyncFetcher.configure(adaptive=True) + ``` + +2. On the first run, pass `auto_save=True` when you select an element. Scrapling records a fingerprint of that element, keyed by the selector: + + ```python + title = response.css('h1.product-title::text', auto_save=True).get() + ``` + +3. On a later run, if the selector no longer matches because the page changed, pass `adaptive=True` with the same selector. Scrapling uses the saved fingerprint to find the element in its new location: + + ```python + title = response.css('h1.product-title::text', adaptive=True).get() + ``` + +Scrapling keeps these fingerprints in a local SQLite database. 
On the Apify platform, the Actor's filesystem doesn't persist between runs. To keep the fingerprints across runs, store that database in a [key-value store](https://docs.apify.com/platform/storage/key-value-store) and restore it on startup. For details, see [Scrapling's adaptive parsing documentation](https://scrapling.readthedocs.io/en/latest/parsing/adaptive.html). + ## Using Apify Proxy Running on the Apify platform gives your scraper access to [Apify Proxy](https://docs.apify.com/platform/proxy), which rotates IP addresses to avoid rate limiting and blocking. In the example above, `main` creates a proxy configuration with `Actor.create_proxy_configuration` and passes a fresh proxy URL to `scrape_page` for every request, which forwards it to Scrapling's `proxy` argument. @@ -75,13 +99,33 @@ Scrapling accepts the proxy as a URL string (for example `http://user:pass@proxy scrapling install ``` -Switching the example Actor from HTTP to a real browser takes only one code change. Swap the `AsyncFetcher.get` call in `scrape_page` for `DynamicFetcher.async_fetch`. The parsing API is identical, so the rest of the Actor stays the same: +To switch the example from HTTP to a real browser, fetch each page through a browser session instead of `AsyncFetcher`. Opening a fresh browser for every page would be wasteful, so `main` enters an `AsyncDynamicSession` once and reuses it for the whole crawl, while `scrape_page` fetches with `session.fetch`. The parsing API is identical, so the extraction code stays the same: {ScraplingBrowserScraper} -To run this on the Apify platform, build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the Docker build to download the browser binaries that Scrapling expects. 
+Note that: + +- `AsyncDynamicSession` launches one browser and keeps it open across `session.fetch` calls, so the crawl doesn't pay the browser-startup cost on every page. +- The proxy URL is passed per fetch, so each page can go through a fresh Apify Proxy IP while sharing the same browser. + +To run this on the Apify platform, build on top of the [Apify Playwright base image](https://hub.docker.com/r/apify/actor-python-playwright), which already ships a browser together with all of its system-level dependencies, and run `scrapling install` during the Docker build to download the browser binaries that Scrapling expects: + +```docker title="Dockerfile" +FROM apify/actor-python-playwright:3.14 + +# Install the Actor's Python dependencies. +COPY requirements.txt ./ +RUN pip install -r requirements.txt + +# Download the browser binaries that Scrapling's browser fetchers need. +RUN scrapling install + +# Copy in the source code and launch the Actor as a module. +COPY . ./ +CMD ["python", "-m", "src"] +``` ## Conclusion @@ -92,5 +136,6 @@ In this guide, you learned how to use Scrapling in your Apify Actors. 
You can no - [Scrapling: Official documentation](https://scrapling.readthedocs.io/) - [Scrapling: Fetchers](https://scrapling.readthedocs.io/en/latest/fetching/choosing/) - [Scrapling: Parsing and selecting elements](https://scrapling.readthedocs.io/en/latest/parsing/selection/) +- [Scrapling: Adaptive parsing](https://scrapling.readthedocs.io/en/latest/parsing/adaptive.html) - [Scrapling: GitHub repository](https://github.com/D4Vinci/Scrapling) - [Apify: Proxy management](https://docs.apify.com/platform/proxy) diff --git a/docs/03_guides/code/07_scrapling_browser.py b/docs/03_guides/code/07_scrapling_browser.py index 3eb50e244..566fcc6da 100644 --- a/docs/03_guides/code/07_scrapling_browser.py +++ b/docs/03_guides/code/07_scrapling_browser.py @@ -1,21 +1,22 @@ +import asyncio from typing import Any +from urllib.parse import urlsplit -from scrapling.fetchers import DynamicFetcher +from scrapling.fetchers import AsyncDynamicSession + +from apify import Actor, Request +from apify.storages import RequestQueue async def scrape_page( + session: AsyncDynamicSession, url: str, *, proxy_url: str | None = None, ) -> tuple[dict[str, Any], list[str]]: - """Fetch a page in a real browser with Scrapling and return data and links.""" + """Fetch a page through the shared browser session and return data and links.""" # `network_idle` waits until the page stops making network requests. - response = await DynamicFetcher.async_fetch( - url, - proxy=proxy_url, - headless=True, - network_idle=True, - ) + response = await session.fetch(url, proxy=proxy_url, network_idle=True) data = { 'url': url, @@ -25,11 +26,94 @@ async def scrape_page( 'h3s': response.css('h3::text').getall(), } - # Collect absolute links from the page. + # Keep only absolute links on the same host. 
links: list[str] = [] + host = urlsplit(url).netloc for href in response.css('a::attr(href)').getall(): link_url = response.urljoin(href) - if link_url.startswith(('http://', 'https://')): + if not link_url.startswith(('http://', 'https://')): + continue + if urlsplit(link_url).netloc == host: links.append(link_url) return data, links + + +async def enqueue_links( + request_queue: RequestQueue, + links: list[str], + *, + depth: int, + max_depth: int, +) -> None: + """Enqueue the links one level deeper, unless max_depth was reached.""" + if depth >= max_depth: + return + + for link_url in links: + Actor.log.info(f'Enqueuing {link_url} ...') + request = Request.from_url(link_url) + request.crawl_depth = depth + 1 + await request_queue.add_request(request) + + +async def main() -> None: + async with Actor: + # Read the Actor input. + actor_input = await Actor.get_input() or {} + start_urls = actor_input.get('startUrls', [{'url': 'https://crawlee.dev'}]) + max_depth = actor_input.get('maxDepth', 1) + + if not start_urls: + Actor.log.info('No start URLs specified in Actor input, exiting...') + await Actor.exit() + + # Set up Apify Proxy and the request queue. + proxy_configuration = await Actor.create_proxy_configuration() + request_queue = await Actor.open_request_queue() + + # Enqueue the start URLs (crawl depth defaults to 0). + for start_url in start_urls: + url = start_url.get('url') + Actor.log.info(f'Enqueuing start URL: {url}') + await request_queue.add_request(Request.from_url(url)) + + # Cap the crawl; raise or remove to follow more pages. + max_requests = 50 + handled_requests = 0 + + # Open the browser once and reuse it for every page in the crawl. 
+ async with AsyncDynamicSession(headless=True) as session: + while handled_requests < max_requests and ( + request := await request_queue.fetch_next_request() + ): + handled_requests += 1 + url = request.url + depth = request.crawl_depth + Actor.log.info(f'Scraping {url} (depth={depth}) ...') + + try: + # Fresh proxy URL per request (None if no proxy). + proxy_url = None + if proxy_configuration: + proxy_url = await proxy_configuration.new_url() + + data, links = await scrape_page(session, url, proxy_url=proxy_url) + await Actor.push_data(data) + Actor.log.info( + f'Stored data from {url} ' + f'(title={data["title"]!r}, {len(links)} links found).' + ) + await enqueue_links( + request_queue, links, depth=depth, max_depth=max_depth + ) + + except Exception: + Actor.log.exception(f'Cannot extract data from {url}.') + + finally: + await request_queue.mark_request_as_handled(request) + + +if __name__ == '__main__': + asyncio.run(main())