Web scraping hasn't gotten easier in 2026. Sites are smarter about detecting bots, JavaScript renders more content than ever, and rate limits are tighter. But the fundamentals still work: you just need to pick the right tool for each situation.
This guide covers three approaches I use daily, from simplest to most powerful. Each one has a real use case and copy-paste code.
1. requests + selectolax for static HTML pages. Fast, lightweight, no browser needed.
2. Playwright for JavaScript-heavy SPAs. Full browser control, stealth mode.
3. Pre-built Apify actors for platform-specific scraping. Maintained, handle edge cases, cloud-ready.
The classic. If the data you need is in the HTML source (right-click → View Source, and it's there), this is all you need. No browser, no JavaScript execution, just HTTP requests and HTML parsing.
#!/usr/bin/env python3
"""Scrape static pages with httpx + selectolax."""
import time, json
from dataclasses import dataclass, asdict

import httpx  # pip install httpx
from selectolax.parser import HTMLParser  # pip install selectolax (faster than BS4)


@dataclass
class Article:
    title: str
    url: str
    author: str
    summary: str


HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/125.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml",
}


def scrape_articles(base_url: str, pages: int = 3) -> list[Article]:
    """Scrape article listings from a blog."""
    articles = []
    client = httpx.Client(headers=HEADERS, follow_redirects=True, timeout=15)
    for page in range(1, pages + 1):
        url = f"{base_url}?page={page}"
        print(f"→ Fetching {url}")
        resp = client.get(url)
        resp.raise_for_status()
        tree = HTMLParser(resp.text)
        for card in tree.css("article.post-card"):
            title_el = card.css_first("h2 a")
            author_el = card.css_first(".author-name")
            summary_el = card.css_first(".post-excerpt")
            if title_el:
                articles.append(Article(
                    title=title_el.text(strip=True),
                    url=title_el.attributes.get("href", ""),
                    author=author_el.text(strip=True) if author_el else "Unknown",
                    summary=summary_el.text(strip=True) if summary_el else "",
                ))
        time.sleep(1.5)  # Be polite: always add delays between requests
    client.close()
    return articles


if __name__ == "__main__":
    results = scrape_articles("https://example-blog.com/articles")
    print(json.dumps([asdict(a) for a in results], indent=2))
    print(f"\n✓ Scraped {len(results)} articles")
selectolax is a Python binding to the Modest engine (written in C) and is 10-20x faster than BeautifulSoup for CSS selector queries. The API is slightly different (.css() instead of .select(), .text() instead of .get_text()), but for scraping at any scale, the speed difference matters.
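To make the API difference concrete, here's the same query in both libraries; a minimal sketch using a made-up inline HTML snippet:

from bs4 import BeautifulSoup  # pip install beautifulsoup4
from selectolax.parser import HTMLParser

html = '<article class="post-card"><h2><a href="/post/1">Hello</a></h2></article>'

# BeautifulSoup
soup = BeautifulSoup(html, "html.parser")
print(soup.select_one("article.post-card h2 a").get_text(strip=True))  # Hello

# selectolax: same query, different method names
tree = HTMLParser(html)
print(tree.css_first("article.post-card h2 a").text(strip=True))  # Hello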
If View Source shows an empty shell because the page renders everything client-side, this approach fails. When that happens, you need a real browser.
Modern SPAs (React, Next.js, Vue) render content client-side. The HTML source is basically empty; all the data loads via JavaScript. For these, you need a headless browser.
Playwright is my go-to over Selenium. It's faster, has better async support, auto-waits for elements, and the stealth plugin makes it harder to detect.
#!/usr/bin/env python3
"""Scrape JavaScript-heavy pages with Playwright."""
import asyncio, json

from playwright.async_api import async_playwright


async def scrape_spa(url: str, item_selector: str, fields: dict) -> list[dict]:
    """
    Scrape a JS-rendered page.

    Args:
        url: Page URL
        item_selector: CSS selector for each item container
        fields: Dict of {field_name: css_selector} relative to each item
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--no-sandbox",
            ],
        )
        context = await browser.new_context(
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                       "AppleWebKit/537.36 Chrome/125.0.0.0 Safari/537.36",
            locale="en-US",
        )
        page = await context.new_page()

        # Block unnecessary resources to speed things up
        await page.route("**/*.{png,jpg,jpeg,gif,svg,woff,woff2}",
                         lambda route: route.abort())

        print(f"→ Loading {url}")
        await page.goto(url, wait_until="networkidle", timeout=30000)

        # Wait for actual content to render
        await page.wait_for_selector(item_selector, timeout=10000)

        # Optional: scroll to trigger lazy loading
        for _ in range(3):
            await page.evaluate("window.scrollBy(0, window.innerHeight)")
            await asyncio.sleep(0.5)

        # Extract data
        items = await page.query_selector_all(item_selector)
        results = []
        for item in items:
            row = {}
            for field_name, selector in fields.items():
                el = await item.query_selector(selector)
                if el:
                    row[field_name] = (await el.inner_text()).strip()
                else:
                    row[field_name] = None
            results.append(row)

        await browser.close()
        return results


# Example: scrape a React-based job board
async def main():
    jobs = await scrape_spa(
        url="https://example-jobs.com/listings",
        item_selector=".job-card",
        fields={
            "title": "h3",
            "company": ".company-name",
            "location": ".location",
            "salary": ".salary-range",
        },
    )
    print(json.dumps(jobs, indent=2))
    print(f"\n✓ Found {len(jobs)} jobs")


if __name__ == "__main__":
    asyncio.run(main())
For pages with infinite scroll, a small helper that keeps scrolling until the page height stops changing works well:

async def scroll_to_bottom(page, max_scrolls=20, delay=1.0):
    """Scroll until no new content loads."""
    prev_height = 0
    scrolls = 0
    for _ in range(max_scrolls):
        curr_height = await page.evaluate("document.body.scrollHeight")
        if curr_height == prev_height:
            break
        prev_height = curr_height
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        await asyncio.sleep(delay)
        scrolls += 1
    return scrolls  # number of scrolls performed
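A usage sketch: in scrape_spa above, this would replace the fixed three-scroll loop.

# Inside scrape_spa, after page.goto(...) and the wait_for_selector call:
n = await scroll_to_bottom(page, max_scrolls=10, delay=0.8)
print(f"Scrolled {n} times before the page stopped growing")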
Setup is two commands:

pip install playwright
playwright install chromium
Even with these flags, sophisticated sites can still detect headless browsers through signals like navigator.webdriver, canvas fingerprints, and WebGL renderer strings. For production scraping, consider playwright-stealth or undetected-playwright; they patch the most common detection vectors.
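A minimal sketch of wiring in playwright-stealth (the import below matches the classic playwright-stealth releases; newer forks expose a different API, so treat this as an assumption and check the version you install):

# pip install playwright-stealth
from playwright_stealth import stealth_async

async def new_stealth_page(context):
    page = await context.new_page()
    await stealth_async(page)  # patches navigator.webdriver and related signals
    return page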
Sometimes writing a scraper from scratch is overkill. If you need data from a specific platform (Medium, Product Hunt, GitHub Trending, PyPI, npm), chances are someone has already built and maintains a scraper for it.
I maintain 14 open-source scrapers on Apify that handle the hard parts: pagination, rate limits, anti-bot measures, and output formatting. They run in the cloud, so no infrastructure to manage.
#!/usr/bin/env python3
"""Use pre-built Apify actors for platform-specific scraping."""
import httpx, os, time

APIFY_TOKEN = os.environ.get("APIFY_TOKEN", "")
BASE = "https://api.apify.com/v2"


def run_actor(actor_id: str, input_data: dict, timeout: int = 120) -> list[dict]:
    """
    Run an Apify actor and return results.

    Free tier: 14 actors, no credit card needed.
    """
    headers = {"Authorization": f"Bearer {APIFY_TOKEN}"}

    # Start the actor run
    r = httpx.post(
        f"{BASE}/acts/{actor_id}/runs",
        headers=headers,
        json=input_data,
        timeout=30,
    )
    r.raise_for_status()
    run_id = r.json()["data"]["id"]
    print(f"→ Started run {run_id}")

    # Poll until finished
    for _ in range(timeout // 5):
        time.sleep(5)
        run = httpx.get(
            f"{BASE}/actor-runs/{run_id}", headers=headers
        ).json()["data"]
        status = run["status"]
        if status == "SUCCEEDED":
            break
        elif status in ("FAILED", "ABORTED"):
            raise RuntimeError(f"Actor run {status}")
        print(f"  Status: {status}...")
    else:
        raise TimeoutError(f"Actor run still {status} after {timeout}s")

    # Fetch results from the run's default dataset
    dataset_id = run["defaultDatasetId"]
    items = httpx.get(
        f"{BASE}/datasets/{dataset_id}/items?format=json",
        headers=headers,
    ).json()
    return items


# Example: scrape Medium articles by topic
if __name__ == "__main__":
    articles = run_actor(
        # The API path uses ~ (not /) between username and actor name
        actor_id="openclawmara~medium-article-scraper",
        input_data={
            "topic": "artificial-intelligence",
            "maxArticles": 50,
        },
    )
    for a in articles[:5]:
        print(f"  {a.get('title', 'N/A')} · {a.get('author', 'N/A')}")
    print(f"\n✓ Got {len(articles)} articles")
Full list: apify.com/openclawmara
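For short runs, Apify also has a one-shot endpoint that starts the actor, waits for it to finish, and returns the dataset items in a single request. A sketch, reusing BASE and APIFY_TOKEN from the script above (synchronous runs are capped at roughly five minutes):

def run_actor_sync(actor_id: str, input_data: dict) -> list[dict]:
    """Start an actor, block until it finishes, and return its dataset items."""
    r = httpx.post(
        f"{BASE}/acts/{actor_id}/run-sync-get-dataset-items",
        headers={"Authorization": f"Bearer {APIFY_TOKEN}"},
        json=input_data,
        timeout=300,
    )
    r.raise_for_status()
    return r.json()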
Getting blocked is the #1 scraping problem. Here's what actually works in 2026:
import random

USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 Chrome/125.0.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/125.0.0.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/124.0.0.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) Gecko/20100101 Firefox/126.0",
]

def get_headers():
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
import random, time

def polite_sleep(min_sec=1.0, max_sec=3.0):
    """Random delay between requests; mimics human browsing."""
    delay = random.uniform(min_sec, max_sec)
    time.sleep(delay)
    return delay
# Bad: new connection every request
for url in urls:
    httpx.get(url, headers=headers)  # New TCP + TLS handshake each time

# Good: reuse connections like a browser does
with httpx.Client(headers=headers, follow_redirects=True) as client:
    for url in urls:
        resp = client.get(url)  # Reuses the TCP connection
        polite_sleep()
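When a site pushes back with 429 Too Many Requests, honor its Retry-After header instead of retrying immediately. A minimal backoff sketch (httpx client as above; get_with_backoff is a hypothetical helper name):

import time
import httpx

def get_with_backoff(client: httpx.Client, url: str, max_retries: int = 3) -> httpx.Response:
    """Retry on 429, waiting as long as the server asks."""
    for attempt in range(max_retries):
        resp = client.get(url)
        if resp.status_code != 429:
            return resp
        # Retry-After is usually seconds; if absent or an HTTP-date, use exponential backoff
        retry_after = resp.headers.get("Retry-After", "")
        wait = float(retry_after) if retry_after.isdigit() else 2 ** attempt
        time.sleep(wait)
    return resp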
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def can_scrape(url: str, user_agent: str = "*") -> bool:
    """Check robots.txt before scraping."""
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
        return rp.can_fetch(user_agent, url)
    except OSError:
        return True  # If robots.txt is unreachable, proceed cautiously
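Usage is a one-line gate in front of any fetch, combined with the helpers above (hypothetical URL):

url = "https://example-blog.com/articles"
if can_scrape(url):
    resp = httpx.get(url, headers=get_headers())
else:
    print(f"robots.txt disallows {url}")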
Web scraping exists in a legal gray area. Stay on the right side of it: scrape only what robots.txt allows (see the checker above), keep request rates low, and back off whenever a site responds with 429 and a Retry-After header.

Quick recap: static HTML → requests + selectolax; JavaScript-heavy SPAs → Playwright; platform-specific sources → pre-built Apify actors.

The AI Agent Toolkit includes ready-to-use scrapers for common platforms, data processing pipelines, and automation scripts: 50+ tools in one package.
Plus 14 free scrapers on Apify that you can run right now, no setup needed.
Get the Toolkit: $19. More articles: 5 Python Scripts Every Developer Should Have · Build a RAG Pipeline in Python