CDP cookies vs document.cookie
set_cookie() uses Chrome’s CDP cookie API, which can set HttpOnly and Secure cookies — unlike document.cookie from JavaScript. This is the recommended approach for authentication tokens and session cookies.
Practical recipes for common VoidCrawl tasks that go beyond basic navigation.
VoidCrawl exposes Chrome’s CDP cookie API directly through set_cookie(), get_cookies(), and delete_cookie(). Cookies are set at the CDP level, which means they support HttpOnly and Secure flags — unlike document.cookie.
import asyncio

from voidcrawl import BrowserConfig, BrowserSession


async def main() -> None:
    async with BrowserSession(BrowserConfig()) as browser:
        # 1. Navigate to the target domain first so the cookie
        #    is scoped to the right origin.
        page = await browser.new_page("https://qscrape.dev/robots.txt")

        # 2. Set cookies via the CDP API -- they persist for
        #    subsequent navigations within this tab.
        await page.set_cookie("session_id", "abc123", path="/")
        await page.set_cookie("locale", "en-US", path="/")

        # 3. Navigate to the real target -- cookies are sent.
        resp = await page.goto("https://qscrape.dev/dashboard")
        print(f"Status: {resp.status_code}")
        print(f"HTML length: {len(resp.html)}")

        # 4. Verify cookies are set.
        cookies = await page.get_cookies()
        for c in cookies:
            print(f"  {c['name']}={c['value']}")


asyncio.run(main())

import asyncio
from voidcrawl import BrowserPool, PoolConfig
async def main() -> None: async with BrowserPool(PoolConfig()) as pool: async with pool.acquire() as tab: # Navigate to origin first so the cookie is domain-scoped. await tab.navigate("https://qscrape.dev/robots.txt") await tab.wait_for_navigation()
await tab.set_cookie( "auth_token", "my-secret", path="/", secure=True, http_only=True, )
# Now fetch the actual page -- cookie is included. resp = await tab.goto("https://qscrape.dev/api/data") print(f"{resp.status_code} -- {len(resp.html)} chars")
asyncio.run(main())CDP cookies vs document.cookie
set_cookie() uses Chrome’s CDP cookie API, which can set HttpOnly and Secure cookies — unlike document.cookie from JavaScript. This is the recommended approach for authentication tokens and session cookies.
# Read all cookies for the current page URLcookies = await page.get_cookies()for c in cookies: print(f"{c['name']}={c['value']} (domain={c['domain']})")
# Delete a cookie by nameawait page.delete_cookie("session_id")
# Delete scoped to a specific domainawait page.delete_cookie("locale", domain="qscrape.dev")VoidCrawl ships built-in actions for capturing network requests using the browser’s PerformanceObserver API. Install the observer after navigation — it uses buffered: true to retroactively capture all resources from the current page load.
import asyncio

from voidcrawl import BrowserConfig, BrowserSession
from voidcrawl.actions import (
    CollectNetworkRequests,
    InstallNetworkObserver,
)


async def main() -> None:
    async with BrowserSession(BrowserConfig()) as browser:
        page = await browser.new_page("https://qscrape.dev")
        await page.wait_for_network_idle()

        # Install observer after load -- buffered: true retroactively
        # captures all resources from the current navigation.
        await InstallNetworkObserver().run(page)
        requests = await CollectNetworkRequests(clear=True).run(page)

        print(f"Captured {len(requests)} network requests:\n")
        for r in requests:
            print(f"  [{r['type']:>10}] {r['duration']:>4}ms"
                  f" {r['size']:>6}B {r['name']}")


asyncio.run(main())

import asyncio
from voidcrawl import BrowserPool, PoolConfigfrom voidcrawl.actions import ( CollectNetworkRequests, InstallNetworkObserver,)
async def main() -> None: async with BrowserPool(PoolConfig()) as pool: async with pool.acquire() as tab: resp = await tab.goto("https://qscrape.dev") print(f"Page: {resp.url} (status {resp.status_code})")
await InstallNetworkObserver().run(tab) requests = await CollectNetworkRequests().run(tab)
print(f"{len(requests)} request URLs captured") for r in requests: print(f" {r['name']}")
asyncio.run(main())# After navigation, filter for specific resource types:xhr_requests = await page.evaluate_js(""" JSON.stringify( performance.getEntriesByType('resource') .filter(e => e.initiatorType === 'xmlhttprequest' || e.initiatorType === 'fetch') .map(e => ({ url: e.name, status: e.responseStatus, duration: Math.round(e.duration), })) )""")Limitations
The PerformanceObserver API sees all sub-resource requests but cannot modify or block them. For request interception (e.g., blocking analytics), use evaluate_js to override fetch or XMLHttpRequest before navigation.
VoidCrawl’s set_headers() method applies to a single tab — headers do not leak between tabs in a pool. This makes it straightforward to assign different identities per tab.
import asyncioimport itertools
from voidcrawl import BrowserPool, PoolConfig
USER_AGENTS = itertools.cycle([ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",])
LANGUAGES = itertools.cycle(["en-US,en;q=0.9", "ja-JP,ja;q=0.9", "de-DE,de;q=0.9"])
async def fetch_with_identity(pool: BrowserPool, url: str) -> str: async with pool.acquire() as tab: # set_headers() applies only to THIS tab -- other tabs are unaffected. await tab.set_headers({ "User-Agent": next(USER_AGENTS), "Accept-Language": next(LANGUAGES), "X-Request-ID": f"vc-{id(tab)}", })
resp = await tab.goto(url) return resp.html
async def main() -> None: urls = [ "https://qscrape.dev/page/1", "https://qscrape.dev/page/2", "https://qscrape.dev/page/3", ] async with BrowserPool(PoolConfig(tabs_per_browser=3)) as pool: results = await asyncio.gather(*[fetch_with_identity(pool, u) for u in urls]) for url, html in zip(urls, results): print(f"{url} -> {len(html)} chars")
asyncio.run(main())For deeper fingerprint control (timezone, screen dimensions, WebGL), inject JavaScript overrides on each tab before navigation:
async def apply_fingerprint( tab, *, timezone: str, screen_w: int, screen_h: int) -> None: """Override browser fingerprint signals for this tab.""" await tab.evaluate_js(f""" // Override timezone Intl.DateTimeFormat = class extends Intl.DateTimeFormat {{ resolvedOptions() {{ return {{ ...super.resolvedOptions(), timeZone: "{timezone}" }}; }} }};
// Override screen dimensions Object.defineProperty(screen, 'width', {{ get: () => {screen_w} }}); Object.defineProperty(screen, 'height', {{ get: () => {screen_h} }}); """)
async def main() -> None: async with BrowserPool(PoolConfig()) as pool: async with pool.acquire() as tab: await apply_fingerprint( tab, timezone="America/New_York", screen_w=1920, screen_h=1080 ) await tab.set_headers({"Accept-Language": "en-US,en;q=0.9"})
resp = await tab.goto("https://qscrape.dev") print(f"{resp.status_code} -- {len(resp.html)} chars")Wrap your per-tab setup into a custom JsActionNode so it can be composed with other actions in a Flow:
from voidcrawl.actions import ( Flow, GetText, JsActionNode, WaitForSelector, inline_js,)
class SetTimezone(JsActionNode): """Override the browser's reported timezone."""
js = inline_js("""\Intl.DateTimeFormat = class extends Intl.DateTimeFormat { resolvedOptions() { return { ...super.resolvedOptions(), timeZone: __params.timezone }; }};return __params.timezone;""")
def __init__(self, timezone: str) -> None: self.timezone = timezone
# Compose into a flow:flow = ( Flow() .add(SetTimezone("Europe/Berlin")) .add(WaitForSelector("h1")) .add(GetText("h1")))
# result = await flow.run(tab)Design note
VoidCrawl intentionally does not have a middleware/plugin system. The Actions framework and evaluate_js() provide composable, per-tab customisation without hidden global state. If you need the same setup on every tab, wrap it in a helper function that runs after each pool.acquire().
goto() returns a PageResponse that tells you the HTTP status, final URL, and whether redirects occurred — before you parse the HTML:
async with pool.acquire() as tab: resp = await tab.goto("https://qscrape.dev/old-page")
if resp.redirected: print(f"Redirected to: {resp.url}")
if resp.status_code and resp.status_code >= 400: print(f"HTTP {resp.status_code} -- skipping") else: print(f"Got {len(resp.html)} chars from {resp.url}")PageResponse attributes:
| Attribute | Type | Description |
|---|---|---|
| `html` | `str` | Full outer HTML after network idle |
| `url` | `str` | Final URL after any redirects |
| `status_code` | `int \| None` | HTTP status of the document response, or `None` when served from cache/service worker |
| `redirected` | `bool` | `True` when at least one HTTP redirect occurred |
status_code is None when the page was served from cache or a service worker (no network response):
if resp.status_code is None: print("Served from cache -- no HTTP status available")set_headers() overrides HTTP headers at the CDP level after stealth patches are applied. The two are complementary — stealth handles Chrome flags and JS patches, while set_headers() controls what the server sees in HTTP:
from voidcrawl import BrowserConfig, BrowserSession
async with BrowserSession(BrowserConfig(stealth=True)) as browser: page = await browser.new_page("about:blank")
# Stealth is already active -- navigator.webdriver is undefined, # automation flags are stripped. Now layer custom HTTP headers: await page.set_headers({ "Accept-Language": "ja-JP,ja;q=0.9", "Referer": "https://www.google.com/", })
resp = await page.goto("https://waf-protected-site.com") print(f"Status: {resp.status_code}, {len(resp.html)} chars")See also: Stealth Mode for details on what stealth mode patches and why.